In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("Amazon sales")
         .getOrCreate())

In [2]:
file_path = "datasets/amazon.csv"

In [3]:
amz_sales_data = spark.read.option("header", "true").csv(file_path)

In [4]:
amz_sales_data.show(2)

+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|        product_name|            category|discounted_price|actual_price|discount_percentage|rating|rating_count|       about_product|             user_id|           user_name|           review_id|        review_title|      review_content|            img_link|        product_link|
+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B07JW9H4J1|Wayona Nylon Brai...|Computers&Accesso...|            ₹399|      ₹1,099|                64%|   4.2|      24,2

In [5]:
amz_sales_data.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- discounted_price: string (nullable = true)
 |-- actual_price: string (nullable = true)
 |-- discount_percentage: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- rating_count: string (nullable = true)
 |-- about_product: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_content: string (nullable = true)
 |-- img_link: string (nullable = true)
 |-- product_link: string (nullable = true)



In [6]:
amz_sales_data.count()

1465

In [7]:
check_duplicates = (amz_sales_data
                  .groupBy("product_id", "product_name", "category", "discounted_price", "actual_price",
                           "discount_percentage", "rating", "rating_count", "about_product", "user_id",
                           "user_name", "review_id", "review_title", "review_content", "img_link", "product_link")
                  .count()
                  .filter("count > 1")
                  .orderBy("count", assending=False))

In [8]:
check_duplicates.show()

+----------+--------------------+--------------------+----------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|product_id|        product_name|            category|discounted_price|actual_price| discount_percentage|              rating|        rating_count|       about_product|             user_id|           user_name|           review_id|        review_title|      review_content|            img_link|        product_link|count|
+----------+--------------------+--------------------+----------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|B09NVPSCQT|"Noise ColorFit P...| 

In [9]:
amz_sales_cdata = amz_sales_data.dropDuplicates(["product_id", "product_name", "category", "discounted_price", "actual_price",
                           "discount_percentage", "rating", "rating_count", "about_product", "user_id",
                           "user_name", "review_id", "review_title", "review_content", "img_link", "product_link"])
print("Count before removing duplicates: "+ str(amz_sales_data.count()))
print("Count after removing duplicates: "+ str(amz_sales_cdata.count()))

Count before removing duplicates: 1465
Count after removing duplicates: 1459


In [10]:
amz_sales_cdata.dropna()

DataFrame[product_id: string, product_name: string, category: string, discounted_price: string, actual_price: string, discount_percentage: string, rating: string, rating_count: string, about_product: string, user_id: string, user_name: string, review_id: string, review_title: string, review_content: string, img_link: string, product_link: string]

In [11]:
from pyspark.sql.functions import col, regexp_replace, sum, count, max, avg
from pyspark.sql.types import IntegerType
# Remove non-numeric characters using regular expression
amz_sales_cdata = (amz_sales_cdata.withColumn("rating_count", regexp_replace(col("rating_count"), "[^0-9.]", ""))
                                .withColumn("discounted_price", regexp_replace(col("discounted_price"), "[^0-9.]", ""))
                                .withColumn("actual_price", regexp_replace(col("actual_price"), "[^0-9.]", ""))
                                .withColumn("discount_percentage", regexp_replace(col("discount_percentage"), "[^0-9.]", ""))
                                .withColumn("rating", regexp_replace(col("rating"), "[^0-9.]", "")))


# Convert the column to Double & IntegerType
amz_sales_cdata = (amz_sales_cdata.withColumn("discounted_price", col("discounted_price").cast("double"))
                                .withColumn("actual_price", col("actual_price").cast("double"))
                                .withColumn("discount_percentage", col("discount_percentage").cast("double"))
                                .withColumn("rating", col("rating").cast("double"))
                                .withColumn("rating_count", col("rating_count").cast("integer")))
                                
# Handling missing values 
amz_sales_cdata = amz_sales_cdata.fillna(value=0, subset=["discounted_price", "actual_price",
                           "discount_percentage", "rating", "rating_count"])


In [12]:
from pyspark.sql.functions import col, mean, stddev, count, desc, format_number, corr, length, rand
from pyspark.sql import functions as F

In [13]:
#Actual price statistics
print("Actual price statistics:")
actual_price_stats = amz_sales_cdata.select(
    format_number(mean('actual_price'),2).alias('mean_actual_price'),
    format_number(F.expr('percentile_approx(actual_price, 0.5)'),2).alias('median_actual_price'),
    format_number(stddev('actual_price'),2).alias('stddev_actual_price'))
actual_price_stats.show()

#Discounted price statistics 
print("Discounted price statistics:")
discounted_price_stats = amz_sales_cdata.select(
    format_number(mean('discounted_price'),2).alias('mean_discounted_price'),
    format_number(F.expr('percentile_approx(discounted_price, 0.5)'),2).alias('median_discounted_price'),
    format_number(stddev('discounted_price'),2).alias('stddev_discounted_price'))
discounted_price_stats.show()

#Discount percentage statistics
print("Discount percentage statistics:")
discount_percentage_stats = amz_sales_cdata.select(
    format_number(mean('discount_percentage'),2).alias('mean_discount_percentage'),
    format_number(F.expr('percentile_approx(discount_percentage, 0.5)'),2).alias('median_discount_percentage'),
    format_number(stddev('discount_percentage'),2).alias('stddev_discount_percentage'))
discount_percentage_stats.show()

#Rating statistics
print("Rating statistics:")
rating_stats = amz_sales_cdata.select(
    format_number(mean('rating'),2).alias('mean_rating'),
    format_number(F.expr('percentile_approx(rating, 0.5)'),2).alias('median_rating'),
    format_number(stddev('rating'),2).alias('stddev_rating'))
rating_stats.show()

#Rating count statistics
print("Rating count statistics:")
rating_count_stats = amz_sales_cdata.select(
    format_number(mean('rating_count'),2).alias('mean_rating_count'),
    format_number(F.expr('percentile_approx(rating_count, 0.5)'),2).alias('median_rating_count'),
    format_number(stddev('rating_count'),2).alias('stddev_rating_count'))
rating_count_stats.show()

Actual price statistics:
+-----------------+-------------------+-------------------+
|mean_actual_price|median_actual_price|stddev_actual_price|
+-----------------+-------------------+-------------------+
|         5,961.18|           1,499.00|          17,488.41|
+-----------------+-------------------+-------------------+

Discounted price statistics:
+---------------------+-----------------------+-----------------------+
|mean_discounted_price|median_discounted_price|stddev_discounted_price|
+---------------------+-----------------------+-----------------------+
|             9,648.33|                 699.00|             125,691.38|
+---------------------+-----------------------+-----------------------+

Discount percentage statistics:
+------------------------+--------------------------+--------------------------+
|mean_discount_percentage|median_discount_percentage|stddev_discount_percentage|
+------------------------+--------------------------+--------------------------+
|        

#### Distribution of products under each catagory

In [14]:
category_count = amz_sales_cdata.select("category").distinct().count()
product_count = amz_sales_cdata.select("product_id").distinct().count()
print("Category count:",category_count)
print("Product count:",product_count)

category_distribution = (amz_sales_cdata.groupBy("category")
                                        .agg(count("product_id").alias("product_count"))
                                        .orderBy(desc("product_count")))
category_distribution.show(truncate=False)

Category count: 243
Product count: 1351
+-----------------------------------------------------------------------------------------------------+-------------+
|category                                                                                             |product_count|
+-----------------------------------------------------------------------------------------------------+-------------+
|Computers&Accessories|Accessories&Peripherals|Cables&Accessories|Cables|USBCables                    |233          |
|Electronics|Mobiles&Accessories|Smartphones&BasicMobiles|Smartphones                                 |68           |
|Electronics|HomeTheater,TV&Video|Televisions|SmartTelevisions                                        |63           |
|Electronics|Headphones,Earbuds&Accessories|Headphones|In-Ear                                         |52           |
|Electronics|HomeTheater,TV&Video|Accessories|RemoteControls                                          |49           |
|Electronics|Wea

In [15]:
highest_discounted_price = amz_sales_cdata.select("product_name", "discounted_price").orderBy(col("discounted_price").desc()).first()
lowest_discounted_price = amz_sales_cdata.select("product_name", "discounted_price").orderBy(col("discounted_price").asc()).first()


print("Product with Highest Discounted Price:-")
print(f"Product Name: {highest_discounted_price.product_name}")
print(f"Discounted Price: {highest_discounted_price.discounted_price}")
print("------------------------------------------------------------------------------------------------------------------")
print("Product with lowest Discounted Price:-")
print(f"Product Name: {lowest_discounted_price.product_name}")
print(f"Discounted Price: {lowest_discounted_price.discounted_price}")


Product with Highest Discounted Price:-
Product Name: Fire-Boltt Phoenix Smart Watch with Bluetooth Calling 1.3"
Discounted Price: 2402402.0
------------------------------------------------------------------------------------------------------------------
Product with lowest Discounted Price:-
Product Name: "3M Post-it Sticky Note Cube, 200 Sheets (4 Colors x 50 Sheets) | 3"" x 3"" Size | For notes
Discounted Price: 0.0


In [16]:

avg_discount_pct = (amz_sales_cdata.groupBy("category")
                                    .agg(avg("discount_percentage").alias("avg_discount_pct"))
                                    .orderBy(col("avg_discount_pct").desc()))
avg_discount_pct.show()                    

+--------------------+----------------+
|            category|avg_discount_pct|
+--------------------+----------------+
|         81X800LGIN"|         59890.0|
|                  PC|          6490.0|
| IPX7 & TWS Featu...|          2490.0|
|              Black"|          1499.0|
|       Sp02 Tracking|          1499.0|
|            Dial Pad|          1002.0|
| Fully-Functional...|           500.0|
| Erasable Reusabl...|           354.0|
| Voice Assistant ...|           247.0|
| Bluetooth Callin...|           150.0|
|Electronics|Headp...|            90.0|
|Computers&Accesso...|            90.0|
|Electronics|Mobil...|            90.0|
|Electronics|Headp...|            88.0|
|Computers&Accesso...|            87.5|
|Electronics|Mobil...|            82.0|
|Computers&Accesso...|            80.0|
|Computers&Accesso...|            78.5|
|Computers&Accesso...|            78.0|
|Electronics|Mobil...|            76.4|
+--------------------+----------------+
only showing top 20 rows



#### correlation btw 'discounted_price' and 'rating'

In [17]:
corr_dsprice_rating = amz_sales_cdata.stat.corr("discounted_price", "rating")
corr_dsprice_price = amz_sales_cdata.stat.corr("actual_price", "rating")
print(corr_dsprice_rating, "&", corr_dsprice_price)

-0.0008382036764974348 & -0.008129490769260716


In [18]:
t10users_highest_reviews = (amz_sales_cdata.groupBy("user_id", "user_name")
                                        .agg(count("user_id").alias("no_of_reviews"))
                                        .orderBy(col("no_of_reviews").desc())
                                        .limit(10)
                        )
t10users_highest_reviews.show()

+--------------------+--------------------+-------------+
|             user_id|           user_name|no_of_reviews|
+--------------------+--------------------+-------------+
|AHIKJUDTVJ4T6DV6I...|$@|\|TO$|-|,Sethu...|           10|
|AGAELRYPMTG5SADZP...|Satheesh Kadiam,P...|            8|
|AG3D6O4STAQKAY2UV...|Manav,Adarsh gupt...|            8|
|AECPFYFQVRUWC3KGN...|ArdKn,Nirbhay kum...|            7|
|AEWAZDZZJLQUYVOVG...|Omkar dhale,JD,HE...|            7|
|AG44HJB2AMIVHAGQZ...|Prashant,Sumesh S...|            6|
|AHWRZWPCTG6ICA7WT...|siddharth patnaik...|            6|
|AFSMISGEYDYIP3Z42...|Ayush,ROHIT A.,Ke...|            5|
|                 4.2|              13,937|            5|
|AHPYDFW6Y3FIQGD2R...|AV,Mathan kumar,A...|            5|
+--------------------+--------------------+-------------+



In [19]:
amz_sales_cdata.groupBy("user_id").agg(mean("rating").alias("avg_rating"),count("review_id").alias("review_count")).show()

+--------------------+------------------+------------+
|             user_id|        avg_rating|review_count|
+--------------------+------------------+------------+
|AGHT3K4KSG5MAQUSX...|               2.0|           1|
|AE35OI7LDTOKU32IF...|               3.9|           1|
|AHFT3PEI64SYXMAXB...|               3.7|           1|
|AGOQZTWW4TWCEF63H...|               4.0|           1|
|AEKLQGYWRYPMVY7BP...|               4.3|           1|
|AF7KVNWBD7JWYLKGK...|               3.6|           1|
|AEAJ3Z2IULDDDQC7K...|               4.1|           1|
|AHXSYSLVVATNHR4SW...|               4.1|           1|
|AF2JQCLSCY3QJATWU...|               4.0|           1|
|AEITVIFC7WZAEQDIV...|               4.2|           3|
|AFBHLRTSYYAZ2IGMV...|               4.1|           1|
|AHS4CWP5EVS55YZCJ...|               3.8|           1|
|AHCZZTKJ5WN7WJSQU...|               3.5|           1|
| effortlessly.|Wa...|              4.15|           4|
|AG2REE6BFNII6CHJQ...|               4.3|           1|
|AHWRZWPCT

In [20]:
ratings = amz_sales_cdata.groupBy("rating").agg(count("rating").alias("count")).orderBy(col("count").desc()).limit(5)
popular_rating = ratings.orderBy(desc("count")).first()

print(f"Most popular rating: {popular_rating.rating}")
ratings.show()

Most popular rating: 4.1
+------+-----+
|rating|count|
+------+-----+
|   4.1|  233|
|   4.2|  219|
|   4.3|  217|
|   4.0|  170|
|   4.4|  118|
+------+-----+



In [21]:
print(f"TOP 10 products with high ratings: ")
highest_product_rating = (amz_sales_cdata
                          .groupBy("product_id", "product_name")
                          .agg(max("rating").alias("rating"))
                          .orderBy(col("rating").desc())
                          .limit(10)
                          .show()
                         )


TOP 10 products with high ratings: 
+----------+--------------------+---------+
|product_id|        product_name|   rating|
+----------+--------------------+---------+
|B088Z1YWBC|"EGate i9 Pro-Max...|3310432.0|
|B09YV463SW|"Fire-Boltt Ninja...|   9999.0|
|B0BLV1GNLN|"WZATCO Pixel | P...|   9990.0|
|B09YV42QHZ|"Fire-Boltt Ninja...|   7999.0|
|B0B3N7LR6K|"Fire-Boltt Visio...|   3999.0|
|B0B3NDPCS9|"Fire-Boltt Visio...|   3999.0|
|B09NC2TY11|"Noise ColorFit U...|   2499.0|
|B0B3RSDSZ3|Fire-Boltt Phoeni...|   1999.0|
|B0B3RS9DNF|Fire-Boltt Phoeni...|   1999.0|
|B0B3RRWSF6|Fire-Boltt Phoeni...|   1998.0|
+----------+--------------------+---------+



In [22]:
famous_product = amz_sales_cdata.select("product_name", "rating_count").orderBy(col("rating_count").desc()).first()
print(f" Most famous product: {famous_product.product_name}")

 Most famous product: Amazon Basics High-Speed HDMI Cable, 6 Feet (2-Pack),Black


In [23]:
famous_products = (amz_sales_cdata.select("product_name", "rating_count").orderBy(col("rating_count").desc()))
famous_products.show()

+--------------------+------------+
|        product_name|rating_count|
+--------------------+------------+
|Amazon Basics Hig...|      426973|
|AmazonBasics Flex...|      426973|
|Amazon Basics Hig...|      426973|
|AmazonBasics Flex...|      426972|
|boAt Bassheads 10...|      363713|
|boAt Bassheads 10...|      363713|
|boAt BassHeads 10...|      363711|
|Redmi 9A Sport (C...|      313836|
|Redmi 9 Activ (Ca...|      313836|
|Redmi 9A Sport (C...|      313832|
|Redmi 9A Sport (C...|      313832|
|boAt Bassheads 22...|      273189|
|Pigeon Polypropyl...|      270563|
|SanDisk Cruzer Bl...|      253105|
|SanDisk Extreme S...|      205052|
|JBL C100SI Wired ...|      192590|
|JBL C100SI Wired ...|      192589|
|JBL C100SI Wired ...|      192587|
|SanDisk Ultra Dua...|      189104|
|boAt Airdopes 121...|      180998|
+--------------------+------------+
only showing top 20 rows



In [24]:
link_analysis = amz_sales_cdata.select('product_link', 'rating') \
    .withColumn('link_length', length('product_link'))

In [25]:
average_rating_by_link_length = (link_analysis.groupBy('link_length') 
                                                .agg(mean('rating').alias('avg_rating')) 
                                                .orderBy('link_length'))
average_rating_by_link_length.show()

+-----------+------------------+
|link_length|        avg_rating|
+-----------+------------------+
|          5| 4.033333333333333|
|          8|               4.4|
|          9|3.9499999999999997|
|         10|               4.3|
|         11|               3.9|
|         13|              3.15|
|         16| 4.233333333333333|
|         17|               4.0|
|         18|               4.8|
|         19|            1998.5|
|         20|               4.2|
|         24|               3.5|
|         25|               3.8|
|         26|               4.1|
|         27|               4.5|
|         28|             687.0|
|         30|               4.0|
|         33|               3.8|
|         35|               4.0|
|         41|               4.1|
+-----------+------------------+
only showing top 20 rows



In [26]:
rating_seg = (amz_sales_cdata.withColumn("rating_type", 
                                         F.when(F.col("rating") >= 4, "positive")
                                         .when((col("rating") >= 3) & (col("rating") < 4), "neutral")
                                         .otherwise("negative"))
             )
rating_seg.show()


+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+-----------+
|product_id|        product_name|            category|discounted_price|actual_price|discount_percentage|rating|rating_count|       about_product|             user_id|           user_name|           review_id|        review_title|       review_content|            img_link|        product_link|rating_type|
+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+-----------+
|B09RX1FK54|boAt Type C A750 ...|Computers&Accesso...|           399.0|       999.

In [27]:
rating_seg.groupBy("rating_type").count().show()

+-----------+-----+
|rating_type|count|
+-----------+-----+
|   positive| 1094|
|    neutral|  337|
|   negative|   28|
+-----------+-----+



In [28]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer, NGram, VectorAssembler, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from datetime import datetime

In [29]:
def build_trigrams(inputCol=["review_content","rating_type"], n=3):
    
    tokenizer = [Tokenizer(inputCol="review_content", outputCol="words")]
    
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="rawFeatures"
    )]
    
    label_stringIdx = [StringIndexer(inputCol = "rating_type", outputCol = "label")]
    
    selector = [ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")]
    
    lr = [LogisticRegression()]
    
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler + label_stringIdx + selector + lr)


In [36]:
(train_set, test_set) = rating_seg.randomSplit([0.80, 0.20], seed = 2000)

In [37]:
print(f"total dataset: {rating_seg.count()}")
print(f"Training dataset: {train_set.count()}")
print(f"Test dataset: {test_set.count()}")

total dataset: 1459
Training dataset: 1148
Test dataset: 311


In [38]:
%%time
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
st = datetime.utcnow()
pipelineFit = build_trigrams().fit(train_set)
print('Training time:', datetime.utcnow() - st)
predictions = pipelineFit.transform(test_set)
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Training time: 0:00:15.601509
Accuracy: 0.6655948553054662
Precision: 0.7411423307757675
Recall: 0.6655948553054662
CPU times: user 72.2 ms, sys: 19.9 ms, total: 92.1 ms
Wall time: 16.9 s
