In [32]:
from textblob import TextBlob
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf

In [33]:
conf = SparkConf().setAppName("Sent Analysis Textblob")

In [34]:
sc = SparkContext(conf=conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Sent Analysis Textblob, master=local[*]) created by __init__ at <ipython-input-3-66f9c693822e>:1 

In [35]:
sqlCtx = SQLContext(sc)

In [36]:
df = sqlCtx.read.json("kindle_store.json")

In [37]:
df

DataFrame[asin: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint]

In [38]:
reviews = df[["reviewText","overall"]]

In [17]:
reviews.show(20)

+--------------------+-------+
|          reviewText|overall|
+--------------------+-------+
|I enjoy vintage b...|    5.0|
|This book is a re...|    4.0|
|This was a fairly...|    4.0|
|I'd never read an...|    5.0|
|If you like perio...|    4.0|
|A beautiful in-de...|    4.0|
|I enjoyed this on...|    4.0|
|Never heard of Am...|    4.0|
|Darth Maul workin...|    5.0|
|This is a short s...|    4.0|
|I think I have th...|    5.0|
|Title has nothing...|    4.0|
|Well written. Int...|    3.0|
|Troy Denning's no...|    3.0|
|I am not for sure...|    5.0|
|I really enjoyed ...|    5.0|
|Great read enjoye...|    5.0|
|Another well writ...|    3.0|
|This one promises...|    5.0|
|I have a version ...|    4.0|
+--------------------+-------+
only showing top 20 rows



In [39]:
sqlCtx.registerDataFrameAsTable(reviews, "table2")

In [40]:
reviews1 = sqlCtx.sql("SELECT reviewText, overall from table2")

In [41]:
#positive->1
#neutral->0
#negative->2
def transform(star):
        if star >=3.0:
                return 1.0
        elif star == 3.0:
                return 0.0
        else:
                return 2.0

In [42]:
transformer = udf(transform)

In [43]:
df1 = reviews1.withColumn("label", transformer(reviews['overall']))

In [44]:
sqlCtx.registerDataFrameAsTable(df1, "table1")

In [45]:
df2 = sqlCtx.sql("SELECT reviewText, label from table1 WHERE reviewText != ''")

In [46]:
df2.show()

+--------------------+-----+
|          reviewText|label|
+--------------------+-----+
|I enjoy vintage b...|  1.0|
|This book is a re...|  1.0|
|This was a fairly...|  1.0|
|I'd never read an...|  1.0|
|If you like perio...|  1.0|
|A beautiful in-de...|  1.0|
|I enjoyed this on...|  1.0|
|Never heard of Am...|  1.0|
|Darth Maul workin...|  1.0|
|This is a short s...|  1.0|
|I think I have th...|  1.0|
|Title has nothing...|  1.0|
|Well written. Int...|  1.0|
|Troy Denning's no...|  1.0|
|I am not for sure...|  1.0|
|I really enjoyed ...|  1.0|
|Great read enjoye...|  1.0|
|Another well writ...|  1.0|
|This one promises...|  1.0|
|I have a version ...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [47]:
def apply_blob(sentence):
    temp = TextBlob(sentence).sentiment[0]
    if temp == 0.0:
        return 0.0
    elif temp >= 0.0:
        return 1.0
    else:
        return 2.0

In [48]:
predictions = udf(apply_blob)

In [49]:
blob_df = df2.withColumn("predicted", predictions(df2['reviewText']))

In [50]:
blob_df.show()

+--------------------+-----+---------+
|          reviewText|label|predicted|
+--------------------+-----+---------+
|I enjoy vintage b...|  1.0|      1.0|
|This book is a re...|  1.0|      1.0|
|This was a fairly...|  1.0|      1.0|
|I'd never read an...|  1.0|      1.0|
|If you like perio...|  1.0|      1.0|
|A beautiful in-de...|  1.0|      1.0|
|I enjoyed this on...|  1.0|      1.0|
|Never heard of Am...|  1.0|      1.0|
|Darth Maul workin...|  1.0|      1.0|
|This is a short s...|  1.0|      1.0|
|I think I have th...|  1.0|      1.0|
|Title has nothing...|  1.0|      1.0|
|Well written. Int...|  1.0|      1.0|
|Troy Denning's no...|  1.0|      1.0|
|I am not for sure...|  1.0|      1.0|
|I really enjoyed ...|  1.0|      1.0|
|Great read enjoye...|  1.0|      1.0|
|Another well writ...|  1.0|      1.0|
|This one promises...|  1.0|      1.0|
|I have a version ...|  1.0|      2.0|
+--------------------+-----+---------+
only showing top 20 rows



In [51]:
true_labels = [i.label for i in blob_df.select("label").collect()]

In [52]:
predicted_labels = [i.predicted for i in blob_df.select("predicted").collect()]

In [53]:
correct = 0
wrong = 0

In [54]:
for i in range(len(true_labels)):
        if true_labels[i] == predicted_labels[i]:
                correct +=1
        else:
                wrong +=1

In [55]:
print('Correct predictions: ', correct)
print('Wrong predictions: ', wrong)
print('Accuracy: ', correct/(correct+wrong))

Correct predictions:  893520
Wrong predictions:  89077
Accuracy:  0.9093453368980365
