## Import the required packages and libraries

In [8]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

In [9]:
appName = "Amazon alexa Review System"
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

## Load the Data from Kaggle

In [10]:
Amazon_csv = spark.read.csv('output.csv', inferSchema=True, header=True)
Amazon_csv.show(truncate=True, n=3)

+------+---------+----------------+--------------------+--------+
|rating|     date|       variation|    verified_reviews|feedback|
+------+---------+----------------+--------------------+--------+
|     5|31-Jul-18|Charcoal Fabric |       Love my Echo!|       1|
|     5|31-Jul-18|Charcoal Fabric |           Loved it!|       1|
|     4|31-Jul-18|  Walnut Finish |Sometimes while p...|       1|
+------+---------+----------------+--------------------+--------+
only showing top 3 rows



In [11]:
print((Amazon_csv.count(), len(Amazon_csv.columns)))

(3150, 5)


#### Check for missing values!!

In [12]:
Amazon_csv.select('feedback').distinct().collect()

[Row(feedback=' this would be a great gift. Once connected to wifi'),
 Row(feedback='0'),
 Row(feedback=' but getting the hang of it."'),
 Row(feedback=' it\'s pretty creepy"'),
 Row(feedback=' but hopefully soon Amazon allows the option to shut it off."'),
 Row(feedback=' gives you the wrong answer or finds information off of Wikipedia!"'),
 Row(feedback='1'),
 Row(feedback=' I can control my lights'),
 Row(feedback='  but I\'ll probably never use it"". Was I wrong!!  Echo is part of my daily routine')]

#### Checking the Schema

In [13]:
Amazon_csv.printSchema() 

root
 |-- rating: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- variation: string (nullable = true)
 |-- verified_reviews: string (nullable = true)
 |-- feedback: string (nullable = true)



In [14]:
Amazon_csv = Amazon_csv.filter((Amazon_csv.feedback==1) | (Amazon_csv.feedback==0))
Amazon_csv.show(truncate=True, n=10)

+------+---------+--------------------+--------------------+--------+
|rating|     date|           variation|    verified_reviews|feedback|
+------+---------+--------------------+--------------------+--------+
|     5|31-Jul-18|    Charcoal Fabric |       Love my Echo!|       1|
|     5|31-Jul-18|    Charcoal Fabric |           Loved it!|       1|
|     4|31-Jul-18|      Walnut Finish |Sometimes while p...|       1|
|     5|31-Jul-18|    Charcoal Fabric |I have had a lot ...|       1|
|     5|31-Jul-18|    Charcoal Fabric |               Music|       1|
|     5|31-Jul-18|Heather Gray Fabric |I received the ec...|       1|
|     3|31-Jul-18|   Sandstone Fabric |Without having a ...|       1|
|     5|31-Jul-18|    Charcoal Fabric |I think this is t...|       1|
|     5|30-Jul-18|Heather Gray Fabric |         looks great|       1|
|     5|30-Jul-18|Heather Gray Fabric |Love it! I’ve lis...|       1|
+------+---------+--------------------+--------------------+--------+
only showing top 10 

In [15]:
Amazon_csv.select('feedback').distinct().collect()

[Row(feedback='0'), Row(feedback='1')]

In [16]:
Amazon_csv_ = Amazon_csv.filter(col("verified_reviews").isNotNull()).select( "verified_reviews", "feedback")
Amazon_csv_.show(truncate=True, n=10)

+--------------------+--------+
|    verified_reviews|feedback|
+--------------------+--------+
|       Love my Echo!|       1|
|           Loved it!|       1|
|Sometimes while p...|       1|
|I have had a lot ...|       1|
|               Music|       1|
|I received the ec...|       1|
|Without having a ...|       1|
|I think this is t...|       1|
|         looks great|       1|
|Love it! I’ve lis...|       1|
+--------------------+--------+
only showing top 10 rows



In [17]:
Amazon_csv_.filter(Amazon_csv.verified_reviews==' ').show()

+----------------+--------+
|verified_reviews|feedback|
+----------------+--------+
|                |       1|
|                |       1|
|                |       1|
|                |       0|
|                |       0|
|                |       0|
|                |       1|
|                |       1|
|                |       1|
|                |       1|
|                |       0|
|                |       0|
|                |       0|
|                |       0|
|                |       1|
|                |       1|
|                |       1|
|                |       1|
|                |       1|
|                |       1|
+----------------+--------+
only showing top 20 rows



#### Summary of the Dataframe

In [18]:
Amazon_csv_ = Amazon_csv_.filter(Amazon_csv_.verified_reviews!=' ')

In [19]:
Amazon_csv_.show(truncate=True, n=10)

+--------------------+--------+
|    verified_reviews|feedback|
+--------------------+--------+
|       Love my Echo!|       1|
|           Loved it!|       1|
|Sometimes while p...|       1|
|I have had a lot ...|       1|
|               Music|       1|
|I received the ec...|       1|
|Without having a ...|       1|
|I think this is t...|       1|
|         looks great|       1|
|Love it! I’ve lis...|       1|
+--------------------+--------+
only showing top 10 rows



In [20]:
Amazon_csv_.filter(Amazon_csv.verified_reviews==' ').show()

+----------------+--------+
|verified_reviews|feedback|
+----------------+--------+
+----------------+--------+



In [21]:
Amazon_csv_.select('feedback').distinct().collect()

[Row(feedback='0'), Row(feedback='1')]

## split the dataset into train and test

In [22]:
dividedData = Amazon_csv_.randomSplit([0.7, 0.3]) 
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 2148 ; Testing data rows: 913


#### Tokenizing the Text

In [23]:
tokenizer = Tokenizer(inputCol="verified_reviews", outputCol="verified_reviews_words")
tokenizedTrain = tokenizer.transform(trainingData)
print(tokenizedTrain)

DataFrame[verified_reviews: string, feedback: string, verified_reviews_words: array<string>]


In [24]:
tokenizedTrain.printSchema()

root
 |-- verified_reviews: string (nullable = true)
 |-- feedback: string (nullable = true)
 |-- verified_reviews_words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [25]:
tokenizedTrain.show(truncate=True, n=10)

+--------------------+--------+----------------------+
|    verified_reviews|feedback|verified_reviews_words|
+--------------------+--------+----------------------+
|"Handy if you don...|       1|  ["handy, if, you,...|
|"I love my echo p...|       1|  ["i, love, my, ec...|
|"I loved how easy...|       1|  ["i, loved, how, ...|
|"I purchased the ...|       1|  ["i, purchased, t...|
|"Overall love it....|       1|  ["overall, love, ...|
|       ***Love it***|       1|      [***love, it***]|
|   3rd Dot. Love it!|       1|  [3rd, dot., love,...|
|4 out of 5 stars....|       1|  [4, out, of, 5, s...|
|4.5 out of 5 Star...|       1|  [4.5, out, of, 5,...|
|A GREAT PRODUCT.....|       1|  [a, great, produc...|
+--------------------+--------+----------------------+
only showing top 10 rows



In [26]:
tokenizedTrain.select('feedback').distinct().collect()

[Row(feedback='0'), Row(feedback='1')]

In [27]:
tokenizedTrain.show()

+--------------------+--------+----------------------+
|    verified_reviews|feedback|verified_reviews_words|
+--------------------+--------+----------------------+
|"Handy if you don...|       1|  ["handy, if, you,...|
|"I love my echo p...|       1|  ["i, love, my, ec...|
|"I loved how easy...|       1|  ["i, loved, how, ...|
|"I purchased the ...|       1|  ["i, purchased, t...|
|"Overall love it....|       1|  ["overall, love, ...|
|       ***Love it***|       1|      [***love, it***]|
|   3rd Dot. Love it!|       1|  [3rd, dot., love,...|
|4 out of 5 stars....|       1|  [4, out, of, 5, s...|
|4.5 out of 5 Star...|       1|  [4.5, out, of, 5,...|
|A GREAT PRODUCT.....|       1|  [a, great, produc...|
|         A great buy|       1|       [a, great, buy]|
|      A great device|       1|    [a, great, device]|
|A great product f...|       1|  [a, great, produc...|
|A great product. ...|       1|  [a, great, produc...|
|A helpful product...|       1|  [a, helpful, prod...|
|A small p

In [28]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(truncate=True, n=5)

+--------------------+--------+----------------------+--------------------+
|    verified_reviews|feedback|verified_reviews_words|     MeaningfulWords|
+--------------------+--------+----------------------+--------------------+
|"Handy if you don...|       1|  ["handy, if, you,...|["handy, expect, ...|
|"I love my echo p...|       1|  ["i, love, my, ec...|["i, love, echo, ...|
|"I loved how easy...|       1|  ["i, loved, how, ...|["i, loved, easy,...|
|"I purchased the ...|       1|  ["i, purchased, t...|["i, purchased, "...|
|"Overall love it....|       1|  ["overall, love, ...|["overall, love, ...|
+--------------------+--------+----------------------+--------------------+
only showing top 5 rows



## hash TF

In [29]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
numericTrainData = hashTF.transform(SwRemovedTrain).select('feedback', 'MeaningfulWords', 'features')
numericTrainData.show(truncate=True, n=10)

+--------+--------------------+--------------------+
|feedback|     MeaningfulWords|            features|
+--------+--------------------+--------------------+
|       1|["handy, expect, ...|(262144,[30234,76...|
|       1|["i, love, echo, ...|(262144,[329,1900...|
|       1|["i, loved, easy,...|(262144,[36702,40...|
|       1|["i, purchased, "...|(262144,[769,1558...|
|       1|["overall, love, ...|(262144,[3502,880...|
|       1|    [***love, it***]|(262144,[19700,20...|
|       1|[3rd, dot., love,...|(262144,[83671,90...|
|       1|[4, 5, stars., or...|(262144,[8254,912...|
|       1|[4.5, 5, stars., ...|(262144,[10564,14...|
|       1|[great, product.....|(262144,[13327,34...|
+--------+--------------------+--------------------+
only showing top 10 rows



In [30]:
numericTrainData.printSchema() 

root
 |-- feedback: string (nullable = true)
 |-- MeaningfulWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [31]:
numericTrainData.select('feedback').distinct().collect()

[Row(feedback='0'), Row(feedback='1')]

In [32]:
numericTrainData = numericTrainData.withColumn("feedback", 
                                  numericTrainData["feedback"]
                                  .cast('int'))

In [33]:
numericTrainData.printSchema() 

root
 |-- feedback: integer (nullable = true)
 |-- MeaningfulWords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



## Training the model

In [45]:
lr = LogisticRegression(labelCol="feedback", featuresCol="features", maxIter=10, regParam=0.01)
model = lr.fit(numericTrainData)  
print ("Training is done!")

Training is done!


In [46]:
tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTest).select('feedback', 'MeaningfulWords', 'features')
numericTest.show(truncate=True, n=10)

+--------+--------------------+--------------------+
|feedback|     MeaningfulWords|            features|
+--------+--------------------+--------------------+
|       1|["handy, expect, ...|(262144,[30234,76...|
|       1|["love, except, a...|(262144,[4125,119...|
|       1|["overall, love, ...|(262144,[3502,880...|
|       0|[&#34;never, buy,...|(262144,[8970,430...|
|       1| [2nd, one..., come]|(262144,[114763,1...|
|       1|[3rd, dot., love,...|(262144,[83671,90...|
|       1|[4.5, 5, stars., ...|(262144,[10564,14...|
|       1|     [great, device]|(262144,[82950,26...|
|       1|[great, investmen...|(262144,[57422,13...|
|       1|[great, investmen...|(262144,[57422,13...|
+--------+--------------------+--------------------+
only showing top 10 rows



In [47]:
prediction = model.transform(numericTest)
predictionFinal = prediction.select("MeaningfulWords", "prediction", "feedback")
predictionFinal.show(n=10, truncate = True)

+--------------------+----------+--------+
|     MeaningfulWords|prediction|feedback|
+--------------------+----------+--------+
|["handy, expect, ...|       1.0|       1|
|["love, except, a...|       1.0|       1|
|["overall, love, ...|       1.0|       1|
|[&#34;never, buy,...|       1.0|       0|
| [2nd, one..., come]|       1.0|       1|
|[3rd, dot., love,...|       1.0|       1|
|[4.5, 5, stars., ...|       1.0|       1|
|     [great, device]|       1.0|       1|
|[great, investmen...|       1.0|       1|
|[great, investmen...|       1.0|       1|
+--------------------+----------+--------+
only showing top 10 rows



## Evaluating the Model

In [49]:

correctPrediction = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['feedback']).count()
totalData = predictionFinal.count()
print("correct prediction:", correctPrediction, ", total data:", totalData, ", accuracy:", correctPrediction/totalData)

correct prediction: 867 , total data: 913 , accuracy: 0.9496166484118291
