In [0]:
# Run with spark-submit.
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType

from textblob import TextBlob



# Reuse the active Spark session when one exists; otherwise create one for this app.
spark = SparkSession.builder.appName("PopularBooks").getOrCreate()

# All five columns are loaded as nullable strings; nothing is cast at read time.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("postTitle", "post", "score", "created_utc", "foundBook")
])

# Load the comma-separated posts file using the explicit schema above.
booksDF = (
    spark.read
    .option("sep", ",")
    .option("encoding", "utf-8")
    .schema(schema)
    .csv("/FileStore/tables/new_file_w_openlib.csv")
)

# Number of rows per `foundBook` value, largest count first.
topBookIDs = (
    booksDF
    .groupBy("foundBook")
    .count()
    .orderBy(func.col("count").desc())
)

# Display the ten most frequent `foundBook` values.
topBookIDs.show(n=10)

+------------+-----+
|   foundBook|count|
+------------+-----+
|        null| 3375|
| Looking for|  494|
|        I. M|  388|
|      I read|  336|
|      Book I|  218|
|      I need|  196|
|   That Book|  176|
|     Find Me|  163|
|Finding help|  160|
|      To get|  148|
+------------+-----+
only showing top 10 rows



In [0]:
# The automated title detection surfaced many false positives, so the top 10
# books were identified by manually going through the count results. Spelling
# variations of the same title are intentionally kept as separate entries.
top_book_list = [
    'Little Life',
    'Wheel of Time',
    'Count of Monte Cristo',
    'East of Eden',
    'Hail Mary',
    'The secret history',
    'Fahrenheit 451',
    'American psycho',
    'It Ends with Us',
    'Anna Karenina',
    'Blood Meridian',
    'My Love',
    'Lord of the rings',
    'The Song of Achilles',
    'Three-Body Problem',
    'The Wheel of Time',
    'House of Leaves',
    'Great Gatsby',
    'Don Quixote',
    'Haunting of Hill House',
    'Bell Jar',
    'Catcher in the Rye',
    'Midnight Library',
    'The haunting of hill house',
    'Brothers Karamazov',
    'The Bell Jar',
    'Percy jackson',
]

# Keep only the rows whose detected title is exactly one of the hand-picked entries.
filteredBooks = booksDF.where(func.col("foundBook").isin(top_book_list))
filteredBooks.show()

+--------------------+--------------------+-----+-----------+--------------------+
|           postTitle|                post|score|created_utc|           foundBook|
+--------------------+--------------------+-----+-----------+--------------------+
|Thoughts on A Lit...|"TW: Self harmI j...|    1| 2022-06-30|         Little Life|
|Lord of the Rings...|"For some odd rea...|    1| 2022-06-30|   Lord of the rings|
|Reading East of E...|Finally got aroun...|    1| 2022-06-30|        East of Eden|
|Thoughts on Ameri...|I was suprised by...|    1| 2022-06-30|     American psycho|
|My problems with ...|I just finished t...|    1| 2022-06-29|        East of Eden|
|The Song of Achil...|Is Patroclus and ...|    1| 2022-06-29|The Song of Achilles|
|“The Song of Achi...|Is Patroclus and ...|    1| 2022-06-29|The Song of Achilles|
|I read A Little L...|The reviews weren...|    1| 2022-06-27|         Little Life|
|Disappointed by E...|Loved the first t...|    1| 2022-06-27|        East of Eden|
|Pro

In [0]:
def sentiment_classification(text):
    """Label a piece of text by the sign of its TextBlob polarity score.

    Args:
        text: The text to classify, or None for a missing value.

    Returns:
        'Negative' when the polarity is below zero, 'Neutral' when it is
        exactly zero, 'Positive' otherwise. None is returned for None input.
    """
    # The `post` column is nullable; TextBlob cannot be built from None, and an
    # exception raised inside a Spark UDF would fail the whole job.
    if text is None:
        return None

    polarity = TextBlob(text).sentiment.polarity
    if polarity < 0:
        return 'Negative'
    if polarity == 0:
        return 'Neutral'
    return 'Positive'
# Wrap the classifier as a Spark UDF. `func.udf` is used because this file only
# imports `pyspark.sql.functions as func`; a bare `udf` is not defined when the
# script is launched with spark-submit.
sentiment = func.udf(sentiment_classification, StringType())
# Also expose the UDF to Spark SQL under the name 'sentiment'.
spark.udf.register('sentiment', sentiment)

# Attach a sentiment label to every filtered post. The UDF already declares a
# string return type, so no extra cast is needed.
bookSentiment = filteredBooks.withColumn('sentiment', sentiment('post'))

# One row per (title, sentiment label) pair, i.e. the counts stay unpivoted.
bookSentimentCount = bookSentiment.groupBy('foundBook', 'sentiment').count()
bookSentimentCount.show()

# Overwrite any previous export so re-running the notebook does not fail
# because the output path already exists.
bookSentimentCount.write.mode("overwrite").option("header", True).csv("/FileStore/tables/counted_sentiment_titles_unpivot.csv")

+--------------------+---------+-----+
|           foundBook|sentiment|count|
+--------------------+---------+-----+
|      Fahrenheit 451| Positive|   29|
|     American psycho| Positive|   20|
|        East of Eden| Negative|    5|
|Haunting of Hill ...| Positive|   19|
|      Blood Meridian| Positive|   20|
|  The secret history| Positive|   28|
|   Lord of the rings| Positive|   19|
|  Three-Body Problem| Negative|    6|
|  Catcher in the Rye| Positive|   15|
|        Great Gatsby| Negative|    4|
|The Song of Achilles| Positive|   22|
|Count of Monte Cr...| Positive|   35|
|      Blood Meridian| Negative|    6|
|     House of Leaves| Positive|   15|
|         Little Life| Positive|   39|
|  Brothers Karamazov| Positive|   15|
|The haunting of h...| Positive|   17|
|         Little Life| Negative|   26|
|   Lord of the rings| Negative|    6|
|        East of Eden| Positive|   31|
+--------------------+---------+-----+
only showing top 20 rows

