In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime

In [2]:
spark = SparkSession.builder \
    .appName("E-commerce analysis") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .getOrCreate()

25/04/09 23:15:30 WARN Utils: Your hostname, ettore1012-IdeaPad-Gaming-3-15ARH05 resolves to a loopback address: 127.0.1.1; using 192.168.15.113 instead (on interface wlp4s0)
25/04/09 23:15:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ettore1012/Project/ecommerce_data_pipeline-main/venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ettore1012/.ivy2/cache
The jars for the packages stored in: /home/ettore1012/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-37719e82-6979-4941-b052-0d24311134df;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 193ms :: artifacts dl 7ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnl

In [3]:
# Ignoring some Spark's log
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Retrive current date, timestamps, week 
today = datetime.now()
timestamp = today.strftime("%Y-%m-%d")
year, week_num, _ = today.isocalendar() # ISO standard week number
partition_folder = f"week_{year}_{week_num}"

In [5]:
# load data on spark
try:
    users_df = spark.read.json(f"s3a://raw-data/users/{partition_folder}/users.json", multiLine = True)
except Exception as e:
    print(f"⚠️ Could not load users: {e}")

try:
    products_df = spark.read.json(f"s3a://raw-data/products/{partition_folder}/products.json", multiLine = True)
except Exception as e:
    print(f"⚠️ Could not load users: {e}")

try:
    carts_df = spark.read.json(f"s3a://raw-data/carts_{timestamp}.json", multiLine = True)
except Exception as e:
    print(f"⚠️ Could not load users: {e}")

                                                                                

In [6]:
# check schema of data
print("\n\nUsers schema\n")
users_df.printSchema()
users_df.show(5)



Users schema

root
 |-- __v: long (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- geolocation: struct (nullable = true)
 |    |    |-- lat: string (nullable = true)
 |    |    |-- long: string (nullable = true)
 |    |-- number: long (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zipcode: string (nullable = true)
 |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- password: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- username: string (nullable = true)

+---+--------------------+------------------+---+-----------------+---------+--------------+---------+
|__v|             address|             email| id|             name| password|         phone| username|
+---+--------------------+------------------+---+-----------------+---------+--

In [7]:
# Explode users_df
users_df_flat = users_df.select(
    col("__V"),
    col("address.city"),
    col("address.geolocation.lat"),
    col("address.geolocation.long"),
    col("address.number"),
    col("address.street"),
    col("address.zipcode"),
    col("email"),
    col("id"),
    col("name.firstname"),
    col("name.lastname"),
    col("password"),
    col("phone"),
    col("username")
)

users_df_flat.show(5)

+---+-----------+--------+--------+------+----------------+----------+------------------+---+---------+--------+---------+--------------+---------+
|__V|       city|     lat|    long|number|          street|   zipcode|             email| id|firstname|lastname| password|         phone| username|
+---+-----------+--------+--------+------+----------------+----------+------------------+---+---------+--------+---------+--------------+---------+
|  0|   kilcoole|-37.3159| 81.1496|  7682|        new road|12926-3874|    john@gmail.com|  1|     john|     doe|  m38rmF$|1-570-236-7033|    johnd|
|  0|   kilcoole|-37.3159| 81.1496|  7267|       Lovers Ln|12926-3874|morrison@gmail.com|  2|    david|morrison|   83r5^_|1-570-236-7033| mor_2314|
|  0|    Cullman| 40.3467|-30.1310|    86|      Frances Ct|29567-1452|   kevin@gmail.com|  3|    kevin|    ryan|kev02937@|1-567-094-1345|kevinryan|
|  0|San Antonio| 50.3467|-20.1310|  6454|Hunters Creek Dr|98234-1734|     don@gmail.com|  4|      don|   romer|

In [8]:
# check schema of data
print("\n\nProducts schema\n")
products_df.printSchema()
products_df.show(5)



Products schema

root
 |-- category: string (nullable = true)
 |-- description: string (nullable = true)
 |-- id: long (nullable = true)
 |-- image: string (nullable = true)
 |-- price: double (nullable = true)
 |-- rating: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rate: double (nullable = true)
 |-- title: string (nullable = true)

+--------------+--------------------+---+--------------------+------+----------+--------------------+
|      category|         description| id|               image| price|    rating|               title|
+--------------+--------------------+---+--------------------+------+----------+--------------------+
|men's clothing|Your perfect pack...|  1|https://fakestore...|109.95|{120, 3.9}|Fjallraven - Fold...|
|men's clothing|Slim-fitting styl...|  2|https://fakestore...|  22.3|{259, 4.1}|Mens Casual Premi...|
|men's clothing|great outerwear j...|  3|https://fakestore...| 55.99|{500, 4.7}|  Mens Cotton Jacket|
|men's clothing|Th

In [9]:
products_df_flat = products_df.select(
    col("category"),
    col("description"),
    col("id"),
    col("image"),
    col("price"),
    col("rating.count"),
    col("rating.rate"),
    col("title")
)

products_df_flat.show(5)

+--------------+--------------------+---+--------------------+------+-----+----+--------------------+
|      category|         description| id|               image| price|count|rate|               title|
+--------------+--------------------+---+--------------------+------+-----+----+--------------------+
|men's clothing|Your perfect pack...|  1|https://fakestore...|109.95|  120| 3.9|Fjallraven - Fold...|
|men's clothing|Slim-fitting styl...|  2|https://fakestore...|  22.3|  259| 4.1|Mens Casual Premi...|
|men's clothing|great outerwear j...|  3|https://fakestore...| 55.99|  500| 4.7|  Mens Cotton Jacket|
|men's clothing|The color could b...|  4|https://fakestore...| 15.99|  430| 2.1|Mens Casual Slim Fit|
|      jewelery|From our Legends ...|  5|https://fakestore...| 695.0|  400| 4.6|John Hardy Women'...|
+--------------+--------------------+---+--------------------+------+-----+----+--------------------+
only showing top 5 rows



In [10]:
# check schema of data
print("\n\nCarts schema\n")
carts_df.printSchema()
carts_df.show(5)



Carts schema

root
 |-- __v: long (nullable = true)
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- productId: long (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- userId: long (nullable = true)

+---+--------------------+---+--------------------+------+
|__v|                date| id|            products|userId|
+---+--------------------+---+--------------------+------+
|  0|2020-03-02T00:00:...|  1|[{1, 4}, {2, 1}, ...|     1|
|  0|2020-01-02T00:00:...|  2|[{2, 4}, {1, 10},...|     1|
|  0|2020-03-01T00:00:...|  3|    [{1, 2}, {9, 1}]|     2|
|  0|2020-01-01T00:00:...|  4|            [{1, 4}]|     3|
|  0|2020-03-01T00:00:...|  5|    [{7, 1}, {8, 1}]|     3|
+---+--------------------+---+--------------------+------+
only showing top 5 rows



In [11]:
carts_df_exploded = carts_df.withColumn("product", explode(col("products")))
carts_df_flat = carts_df_exploded.select(
    "__V",
    "date",
    "id",
    col("product.productID").alias("product_ID"),
    col("product.quantity").alias("quantity")
)
carts_df_flat.show(5)

+---+--------------------+---+----------+--------+
|__V|                date| id|product_ID|quantity|
+---+--------------------+---+----------+--------+
|  0|2020-03-02T00:00:...|  1|         1|       4|
|  0|2020-03-02T00:00:...|  1|         2|       1|
|  0|2020-03-02T00:00:...|  1|         3|       6|
|  0|2020-01-02T00:00:...|  2|         2|       4|
|  0|2020-01-02T00:00:...|  2|         1|      10|
+---+--------------------+---+----------+--------+
only showing top 5 rows



In [12]:
# Clarify some column names
products_df_flat = products_df_flat.withColumnRenamed("id", "Product_ID")
users_df_flat = users_df_flat.withColumnRenamed("id", "User_ID")
carts_df_flat = carts_df_flat.withColumnRenamed("id", "User_ID")

In [16]:
# Build enriched carts facts data sets with static data from users and products
enriched_df = carts_df_flat.join(
    products_df_flat,
    products_df_flat.Product_ID == carts_df_flat.product_ID,
    "left_outer"
)

enriched_df = enriched_df.join(
    users_df_flat,
    users_df_flat.User_ID == enriched_df.User_ID,
    "left_outer" 
)

enriched_df.show()

+---+--------------------+-------+----------+--------+----------------+--------------------+----------+--------------------+------+-----+----+--------------------+---+-----------+--------+--------+------+----------------+----------+------------------+-------+---------+--------+---------+--------------+---------+
|__V|                date|User_ID|product_ID|quantity|        category|         description|Product_ID|               image| price|count|rate|               title|__V|       city|     lat|    long|number|          street|   zipcode|             email|User_ID|firstname|lastname| password|         phone| username|
+---+--------------------+-------+----------+--------+----------------+--------------------+----------+--------------------+------+-----+----+--------------------+---+-----------+--------+--------+------+----------------+----------+------------------+-------+---------+--------+---------+--------------+---------+
|  0|2020-03-02T00:00:...|      1|         1|       4|  me