In [18]:
# Import necessary libraries and initialize a SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as f

In [19]:
# Initialize a Spark session
spark = SparkSession.builder.appName("Question_1").getOrCreate()

In [20]:
orders_df = spark.read.csv("Data/olist_orders_dataset.csv", header=True, inferSchema=True)


In [21]:
orders_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [22]:
delivered_orders_df = orders_df.filter(orders_df["Order_status"] == "delivered")

In [23]:
# Extract the hour of the day from the timestamp
delivered_orders_df = delivered_orders_df.withColumn("Order_purchase_hour", f.hour("Order_purchase_timestamp"))
delivered_orders_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|Order_purchase_hour|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|                 10|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27

In [24]:
# Create a column for the time slot 
delivered_orders_df = delivered_orders_df.withColumn("Order_purchase_time_slot",
    f.when((delivered_orders_df["Order_purchase_hour"] >= 0) & (delivered_orders_df["Order_purchase_hour"] <= 6), "Dawn")
    .when((delivered_orders_df["Order_purchase_hour"] >= 7) & (delivered_orders_df["Order_purchase_hour"] <= 12), "Morning")
    .when((delivered_orders_df["Order_purchase_hour"] >= 13) & (delivered_orders_df["Order_purchase_hour"] <= 18), "Afternoon")
    .otherwise("Night")
)
delivered_orders_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|Order_purchase_hour|Order_purchase_time_slot|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|                 10|                 Morning|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|

In [25]:
# Use window function to partition by order_purchase_timestamp
window_spec = Window.partitionBy("Order_purchase_timestamp")

In [26]:
delivered_orders_df = delivered_orders_df.withColumn("Orders_within_timestamp", f.count("Order_purchase_timestamp").over(window_spec))

In [27]:
delivered_orders_df.show()



+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+------------------------+-----------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|Order_purchase_hour|Order_purchase_time_slot|Orders_within_timestamp|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+------------------------+-----------------------+
|c3d9e402b6a0fbe2a...|5720a15d022c09d26...|   delivered|     2016-10-04 10:16:04|2016-10-04 10:45:33|         2016-10-28 15:34:00|          2016-11-08 10:41:54|          2016-12-08 00:00:00|                 10|          

                                                                                

In [28]:
# Define a window specification for calculating window aggregates
window_spec = Window.partitionBy("customer_id")

# Calculate the total number of orders in each time slot for each customer
customer_time_slot_counts = joined_df.orderBy("customer_id", "Order_purchase_time_slot") 

# Show the result
customer_time_slot_counts.select("customer_id", "Order_purchase_time_slot").show()

NameError: name 'joined_df' is not defined

In [None]:
# Define a window specification for calculating window aggregates
window_spec = Window.partitionBy("customer_id")

# Calculate the total number of orders in each time slot
time_slot_counts = orders_df.groupBy("Order_purchase_time_slot").agg(f.count("*").alias("TotalOrder"))



# Show the result
time_slot_counts.show()

+------------------------+----------+
|Order_purchase_time_slot|TotalOrder|
+------------------------+----------+
|                    Dawn|      5242|
|                 Morning|     27733|
|               Afternoon|     38135|
|                   Night|     28331|
+------------------------+----------+



In [None]:
from pyspark.sql.functions import unix_timestamp

delivered_orders_df = delivered_orders_df.withColumn(
    "delivery_time_days",
    (unix_timestamp("order_delivered_customer_date") - unix_timestamp("order_purchase_timestamp")) / (24 * 3600)
)
delivered_orders_df.show()

#Average delivery time 
average_delivery_time = delivered_orders_df.selectExpr("avg(delivery_time_days) as avg_delivery_time").first()["avg_delivery_time"]
print("Average Delivery Time (in days):", average_delivery_time)


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+------------------------+------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|Order_purchase_hour|Order_purchase_time_slot|delivery_time_days|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+------------------------+------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|                 10|                 Morning|