In [1]:
import os
from collections import Counter

import pandas as pd
import pyspark.sql.functions as sf
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql import DataFrame
from pyspark.sql.types import *


In [2]:
from pyspark.sql import SparkSession

# Settings requested for this job, applied to the builder in the order listed.
executor_settings = [
    # the number of executors this job needs
    ("spark.executor.instances", 2),
    # the number of CPU cores and the amount of memory this job needs from
    # each executor; it would be reserved on the worker
    ("spark.executor.cores", "2"),
    ("spark.executor.memory", "4G"),
]

session_builder = SparkSession.builder.master("spark://127.0.0.1:7077")
for setting_name, setting_value in executor_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/24 14:05:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the raw review JSON without passing a schema (nothing is displayed here).
reviews_json_path = "data/Pet_Supplies.json"
data = spark.read.json(reviews_json_path)

                                                                                

In [4]:
# data.printSchema()

# Sub-fields of the nested "style" struct, in schema order; every one of them
# is a nullable string.
style_field_names = [
    "Color Name",
    "Color",
    "Design",
    "Flavor Name",
    "Flavor",
    "Format",
    "Item Display Length",
    "Item Display Weight",
    "Item Package Quantity",
    "Item Shape",
    "Length",
    "Material",
    "Model Number",
    "Number of Items",
    "Package Quantity",
    "Package Type",
    "Pattern",
    "Product Packaging",
    "Scent Name",
    "Scent",
    "Size Name",
    "Size",
    "Style Name",
    "Style",
    "Wattage",
]
style_struct = StructType(
    [StructField(field_name, StringType(), True) for field_name in style_field_names]
)

# Explicit schema for the review records (all fields nullable).
# The "image" field is deliberately left commented out of the schema.
data_schema = StructType([
    StructField("asin", StringType(), True),
    # StructField("image", StringType(), True),
    StructField("overall", DoubleType(), True),
    StructField("reviewText", StringType(), True),
    StructField("reviewTime", StringType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("reviewerName", StringType(), True),
    StructField("style", style_struct, True),
    StructField("summary", StringType(), True),
    StructField("unixReviewTime", LongType(), True),
    StructField("verified", BooleanType(), True),
    StructField("vote", StringType(), True),
])


In [5]:
# Load the reviews using the explicit schema defined in data_schema.
# data_df = spark.read.json("data/Pet_Supplies.json", schema=data_schema)
data_df = spark.read.schema(data_schema).json("data/Pet_Supplies.json")

##### Data Info
        data_df.columns --> ['asin', 'overall', 'reviewText', 'reviewTime', 'reviewerID', 'reviewerName',
                             'style', 'summary', 'unixReviewTime', 'verified', 'vote']

        Number of rows: 6,542,483; Number of columns: 11


In [141]:
# Preview the first 5 reviews with full (untruncated) column contents.
data_df.show(n=5, truncate=False)

+----------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------+----------------+-----+-------------------------------+--------------+--------+----+
|asin      |overall|reviewText                                                                                                                                                                                                                                |reviewTime |reviewerID    |reviewerName    |style|summary                        |unixReviewTime|verified|vote|
+----------+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------

In [81]:
##### Meta Data Info
# meta_df.columns --> ['also_buy', 'also_view',  'asin',  'brand',  'category',  'date',  'description',

# 'details',  'feature',  'fit',  'imageURL',  'imageURLHighRes',  'main_cat',

# 'price',  'rank',  'similar_item',  'tech1',  'tech2',  'title']

# Number of rows: 205,999; Number of columns: 19
# meta_df.show(5, truncate=False)
# join data_df and meta_df on asin
# joined_df = data_df.join(meta_df, on=['asin'], how='left')

In [242]:
# Give every row of data_df its own key in a new "id" column.
# NOTE(review): per the Spark docs these ids are unique and increasing but not
# guaranteed to be consecutive — confirm nothing downstream assumes 0..N-1.
row_key = sf.monotonically_increasing_id()
add_keys = data_df.withColumn("id", row_key)
# check the count of distinct ids
# add_keys.select("id").distinct().count()



+----------+-------+--------------------+-----------+--------------+------------------+-----+--------------------+--------------+--------+----+---+
|      asin|overall|          reviewText| reviewTime|    reviewerID|      reviewerName|style|             summary|unixReviewTime|verified|vote| id|
+----------+-------+--------------------+-----------+--------------+------------------+-----+--------------------+--------------+--------+----+---+
|0972585419|    3.0|Either my bird ca...|09 27, 2007|A13K4OZKAAHOXS|    100indecisions| NULL|Still waiting for...|    1190851200|    true|   5|  0|
|0972585419|    4.0|The CD is a good ...|08 25, 2007|A1DWYEX4P7GB7Z|         J. Weaver| NULL|Feathered Phonics...|    1188000000|    true|   6|  1|
|0972585419|    4.0|good cd to teach ...|02 20, 2007|A3NVN97YJSKEPC|  Theresa Ehrhardt| NULL|             bird cd|    1171929600|    true|   5|  2|
|0972585419|    1.0|I was not happy w...|12 30, 2016|A1PDMES1LYA0DP|             Kathi| NULL|            One Sta

In [245]:
# Keep only the columns used by the text analysis: row id, rating, review text.
review_columns = ['id', 'overall', 'reviewText']
review_df = add_keys.select(*review_columns)





#### ADJUST TO NOT USE SUBSET DATA

In [246]:
# Stratified sample on the rating column: every rating value gets the same
# sampling fraction, and the result is then capped at 500 rows.
# NOTE(review): limit() is applied without an explicit ordering — confirm that
# whichever rows it keeps are acceptable for this subset.
rating_fractions = {rating: 0.2 for rating in (1.0, 2.0, 3.0, 4.0, 5.0)}
sampled_df = review_df.sampleBy('overall', fractions=rating_fractions, seed=0).limit(500)

# (row count, column count) of the sample
sample_shape = (sampled_df.count(), len(sampled_df.columns))
print('Sampled_df shape: ', sample_shape)

# sampled_df = sampled_df.sort(sf.col("id").asc())
# sampled_df.show()

                                                                                

Sampled_df shape:  (500, 3)
+---+-------+--------------------+
| id|overall|          reviewText|
+---+-------+--------------------+
|  2|    4.0|good cd to teach ...|
|  9|    3.0|Bird showed no in...|
| 16|    1.0|Horrible just a w...|
| 22|    2.0|Disc works fine b...|
| 26|    5.0|Great for teachin...|
| 32|    1.0|It does not conta...|
| 35|    1.0|Don't waste your ...|
| 46|    4.0|Really like this ...|
| 57|    4.0|finish reading th...|
| 58|    5.0|This is the best ...|
| 64|    5.0|My cat literally ...|
| 66|    3.0|They came to me r...|
| 69|    5.0| My cats love it lol|
| 79|    2.0|Purchased this ca...|
| 80|    5.0|My one year old c...|
| 88|    4.0|Two of three of t...|
| 91|    5.0|I bought this vid...|
| 92|    3.0|If I had gotten j...|
|102|    5.0|Nice coat is a gr...|
|105|    5.0|Power - Mune Chic...|
+---+-------+--------------------+
only showing top 20 rows



                                                                                

In [241]:
# Text clean-up and stop-word removal for the sampled reviews.
# Starts from `sampled_df`, which is built in the sampling cell above.
# print(sampled_df.schema)

# Drop reviews without text first, then remove punctuation from reviewText,
# convert to lowercase, and split reviews into a list of words.
clean_txt = (
    sampled_df
    .filter(sf.col("reviewText").isNotNull())
    .withColumn("reviewText", sf.regexp_replace(sf.col("reviewText"), "[^a-zA-Z0-9\\s]", ""))
    .withColumn("reviewText", sf.lower(sf.col("reviewText")))
    # Split on runs of ANY whitespace (spaces, newlines, tabs) and discard the
    # empty strings that leading/trailing whitespace leaves behind. Splitting
    # on a single " " produced empty tokens and tokens with embedded newlines.
    .withColumn(
        "splitText",
        sf.array_remove(sf.split(sf.col("reviewText"), "\\s+"), ""),
    )
)

# remove stop words
remover = StopWordsRemover(inputCol="splitText", outputCol="filteredWords")
# NOTE: despite its name, `stop_words_remover` is the transformed DataFrame
# (clean_txt plus a "filteredWords" column), not the transformer itself.
# The name is kept so any code that already refers to it keeps working.
stop_words_remover = remover.transform(clean_txt)

filtered_df = stop_words_remover.select('id', 'overall', 'reviewText', 'filteredWords')
filtered_df.show()


                                                                                

+---+-------+--------------------+
| id|overall|          reviewText|
+---+-------+--------------------+
|  2|    4.0|good cd to teach ...|
|  9|    3.0|Bird showed no in...|
| 16|    1.0|Horrible just a w...|
| 22|    2.0|Disc works fine b...|
| 26|    5.0|Great for teachin...|
| 32|    1.0|It does not conta...|
| 35|    1.0|Don't waste your ...|
| 46|    4.0|Really like this ...|
| 57|    4.0|finish reading th...|
| 58|    5.0|This is the best ...|
| 64|    5.0|My cat literally ...|
| 66|    3.0|They came to me r...|
| 69|    5.0| My cats love it lol|
| 79|    2.0|Purchased this ca...|
| 80|    5.0|My one year old c...|
| 88|    4.0|Two of three of t...|
| 91|    5.0|I bought this vid...|
| 92|    3.0|If I had gotten j...|
|102|    5.0|Nice coat is a gr...|
|105|    5.0|Power - Mune Chic...|
+---+-------+--------------------+
only showing top 20 rows

+---+-------+--------------------+--------------------+
| id|overall|          reviewText|       filteredWords|
+---+-------+---------

In [247]:
def count_words(words):
    """Count how often each word occurs in a list of tokens.

    Tokens are split on any run of whitespace before counting, so newline
    characters and empty / whitespace-only tokens are dropped, and a token
    such as "good\\ncd" is counted as the two words "good" and "cd" instead
    of being fused into "goodcd".

    Args:
        words: list of token strings; ``None`` or an empty list is allowed.

    Returns:
        A plain dict mapping each word to its number of occurrences
        (an empty dict when there is nothing to count).
    """
    if not words:
        return {}
    # str.split() with no argument splits on whitespace runs and never yields
    # empty strings; Counter tallies in a single pass over the tokens.
    return dict(Counter(piece for token in words for piece in token.split()))
# Corpus-wide word frequencies via map-reduce on the underlying RDD.
# Each review is passed through count_words, so these totals use exactly the
# same token clean-up as the per-review counts computed below in this cell.
word_counts_rdd = (
    filtered_df.select("filteredWords")
    .rdd  # Convert to RDD for map-reduce operations
    # emit one (word, count-within-this-review) pair per distinct word
    .flatMap(lambda row: count_words(row["filteredWords"]).items())
    # sum the per-review counts of each word
    .reduceByKey(lambda a, b: a + b)
)

# bring the (word, total count) pairs to the driver as a dict
word_counts_dict = word_counts_rdd.collectAsMap()

# pair a review's id with its word counts (as a dict)
def add_word_counts(row):
    """Return ``(id, word_counts)`` for one row.

    ``row`` must provide an "id" entry and a "filteredWords" entry; the
    counts are whatever ``count_words`` returns for "filteredWords".
    """
    return row["id"], count_words(row["filteredWords"])

# run add_word_counts over every row of filtered_df
filtered_df_with_counts_rdd = filtered_df.rdd.map(add_word_counts)

# turn the (id, word_counts) pairs back into a DataFrame
counts_columns = ["id", "word_counts"]
filtered_df_with_counts = filtered_df_with_counts_rdd.toDF(schema=counts_columns)

filtered_df_with_counts.show(truncate=False)

                                                                                

+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |word_counts                                                                                                                                                                                                                    

In [248]:
# Left-join the per-review word counts onto filtered_df by id, keeping only
# id, overall, reviewText, and word_counts.
join_df = (
    filtered_df
    .join(filtered_df_with_counts, on="id", how="left")
    .select("id", "overall", "reviewText", "word_counts")
)

In [249]:
# order join_df by ascending id
join_df = join_df.orderBy(sf.asc("id"))

# join_df.show()

# report the (row count, column count) of join_df
join_shape = (join_df.count(), len(join_df.columns))
print('Join_df shape: ', join_shape)

Join_df shape:  (500, 4)


                                                                                

 1. Identify review keywords,
         calculate their frequencies, and
         compute the average rating for specific products or product categories.


 2.  Run analyses to determine product trends,
         identify the most popular product categories, or
         identify relationships between customers who purchase and review the same item.

In [None]:
# meta_df = spark.read.schema(meta_schema).json("data/meta_Pet_Supplies.json")
# meta_df = spark.read.json("data/meta_Pet_Supplies.json", schema=meta_schema)

In [None]:
# # meta_data.printSchema()

# meta_schema = StructType([StructField("also_buy", ArrayType(StringType(), True), True),
#                             StructField("also_view", ArrayType(StringType(), True), True),
#                             StructField("asin", StringType(), True),
#                             StructField("brand", StringType(), True),
#                             StructField("category", ArrayType(StringType(), True), True),
#                             StructField("date", StringType(), True),
#                             StructField("description", ArrayType(StringType(), True), True),
#                             StructField("details", StructType([
#                                 StructField("\n    Item Weight: \n    ", StringType(), True),
#                                 StructField("\n    Package Dimensions: \n    ", StringType(), True),
#                                 StructField("\n    Product Dimensions: \n    ", StringType(), True),
#                                 StructField(" Date first listed on Amazon:", StringType(), True),
#                                 StructField(" UNSPSC Code:", StringType(), True),
#                                 StructField(" ASIN:", StringType(), True),
#                                 StructField(" ASIN: ", StringType(), True),
#                                 StructField(" Audio CD", StringType(), True),
#                                 StructField(" Audio Cassette", StringType(), True),
#                                 StructField(" Batteries", StringType(), True),
#                                 StructField(" Discontinued by manufacturer:", StringType(), True),
#                                 StructField(" Domestic Shipping: ", StringType(), True),
#                                 StructField(" Hardcover:", StringType(), True),
#                                 StructField(" International Shipping: ", StringType(), True),
#                                 StructField(" Item model number:", StringType(), True),
#                                 StructField(" Label:", StringType(), True),
#                                 StructField(" Language:", StringType(), True),
#                                 StructField(" Number of Discs:", StringType(), True),
#                                 StructField(" Publisher:", StringType(), True),
#                                 StructField(" Shipping Advisory:", StringType(), True),
#                                 StructField(" Shipping Weight:", StringType(), True),
#                                 StructField(" Subtitles:", StringType(), True),
#                                 StructField(" UPC:", StringType(), True),]), True),
#                             StructField("feature", ArrayType(StringType(), True), True),
#                             StructField("fit", StringType(), True),
#                             StructField("imageURL", ArrayType(StringType(), True), True),
#                             StructField("imageURLHighRes", ArrayType(StringType(), True), True),
#                             StructField("main_cat", StringType(), True),
#                             StructField("price", StringType(), True),
#                             StructField("rank", StringType(), True),
#                             StructField("similar_item", StringType(), True),
#                             StructField("tech1", StringType(), True),
#                             StructField("tech2", StringType(), True),
#                             StructField("title", StringType(), True)])