In [None]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=9cccc116e48a36de22c6288741e7c09f0cceb7b5bf741b41a0413e4d02826c12
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, col

In [None]:

# Obtain the notebook's Spark entry point: reuse an already-running session
# if one exists, otherwise start a new one named "SalesDataAnalysis".
spark = (
    SparkSession.builder
    .appName("SalesDataAnalysis")
    .getOrCreate()
)


In [None]:
# Read the raw product listing from CSV. The first line of the file supplies
# the column names, and Spark scans the data to infer each column's type.
sales_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("apple_products.csv")
)
# Preview the first rows of the raw frame.
sales_data.show()

+-----+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|index|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+-----+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|    0|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|    1|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|    2|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|    3|APPLE iPh

In [None]:
# Handle missing values: na.drop() with no arguments removes every row that
# contains a null in any column. The result is bound to a new name so the raw
# `sales_data` frame stays untouched.
cleaned_sales_data = sales_data.na.drop()
# Preview the cleaned frame (not the raw one) so the output reflects this step.
cleaned_sales_data.show()

+-----+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|index|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+-----+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|    0|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|    1|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|    2|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|    3|APPLE iPh

In [None]:
# Remove duplicate entries. dropDuplicates() is called without a column subset,
# so two rows count as duplicates only when every column matches.
# NOTE(review): the frame carries an `index` column, so full-row duplicates may
# never occur; confirm whether a subset that excludes `index` was intended.
cleaned_sales_data = cleaned_sales_data.dropDuplicates()
# Preview the de-duplicated frame (not the raw one) so the output reflects this step.
cleaned_sales_data.show()

+-----+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|index|        Product Name|         Product URL|Brand|Sale Price|   Mrp|Discount Percentage|Number Of Ratings|Number Of Reviews|             Upc|Star Rating| Ram|
+-----+--------------------+--------------------+-----+----------+------+-------------------+-----------------+-----------------+----------------+-----------+----+
|    0|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     49900| 49900|                  0|             3431|              356|MOBEXRGV7EHHTGUH|        4.6|2 GB|
|    1|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVAC6TJT4F|        4.6|2 GB|
|    2|APPLE iPhone 8 Pl...|https://www.flipk...|Apple|     84900| 84900|                  0|             3431|              356|MOBEXRGVGETABXWZ|        4.6|2 GB|
|    3|APPLE iPh

In [None]:
# Aggregate per product: rows sharing the same "Product Name" are grouped and
# their "Sale Price" values are added up into a column named "total_sales".
# `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not Python's built-in sum.
total_sales_by_product = (
    cleaned_sales_data
    .groupBy("Product Name")
    .agg(sum(col("Sale Price")).alias("total_sales"))
)


In [None]:
# Print the first 20 aggregated rows, truncating long cell values
# (these are the defaults of DataFrame.show, spelled out for the reader).
total_sales_by_product.show(n=20, truncate=True)

+--------------------+-----------+
|        Product Name|total_sales|
+--------------------+-----------+
|APPLE iPhone 11 (...|      46999|
|APPLE iPhone 8 (S...|      77000|
|APPLE iPhone 12 P...|     120900|
|APPLE iPhone 12 (...|      70900|
|APPLE iPhone 12 (...|      70900|
|Apple iPhone XR (...|      41999|
|APPLE iPhone 12 M...|      64900|
|APPLE iPhone 12 P...|     130900|
|APPLE iPhone 12 P...|     120900|
|APPLE iPhone 11 P...|     117100|
|APPLE iPhone SE (...|      34999|
|Apple iPhone SE (...|      44999|
|APPLE iPhone 12 P...|     130900|
|APPLE iPhone XS M...|      89900|
|APPLE iPhone 12 M...|      59900|
|APPLE iPhone 11 P...|     117900|
|APPLE iPhone SE (...|      34999|
|APPLE iPhone 12 P...|     110900|
|APPLE iPhone 12 P...|     120900|
|APPLE iPhone 8 (S...|      77000|
+--------------------+-----------+
only showing top 20 rows



In [None]:
# Persist the per-product totals as CSV with a header row.
# Spark writes a directory named "filterproducts.csv" containing part files,
# not a single file. The default save mode raises an error when that path
# already exists, so "overwrite" is set to keep the notebook re-runnable.
(
    total_sales_by_product.write
    .mode("overwrite")
    .option("header", "true")
    .csv("filterproducts.csv")
)

In [None]:
# Stop the SparkSession created earlier in this notebook.
# Cells that use `spark` will need a new session if they are run after this one.
spark.stop()