In [3]:
'''
You have been given two DataFrames related to an E-commerce platform. The first contains information about the products and their categories, while the second contains information about the orders placed for these products. 

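Calculate the average price and the total number of orders for each product category and display the result. As the example below shows, the average is taken over order rows: a product's price counts once for every order placed for it, and products with no orders do not contribute.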

Input DataFrames:

products_df

Column Name	Data Type
product_id	Integer
category	String
price	Float
orders_df

Column Name	Data Type
order_id	Integer
product_id	Integer
quantity	Integer
Output DataFrame:

Column Name	Data Type
category	String
avg_price	Float
total_orders_count	Integer
Example
products_df

product_id	category	price
1	Apparel	25.99
2	Apparel	35.99
3	Footwear	50.00
4	Footwear	75.00
5	Apparel	20.00
orders_df

order_id	product_id	quantity
101	1	2
102	2	1
103	1	3
104	3	1
105	4	2
output

avg_price	category	total_orders_count
29.323333	Apparel	3
62.500000	Footwear	2
'''

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Obtain the Spark session for this exercise; getOrCreate() reuses an
# already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Product catalogue schema: (column name, Spark type) pairs, every column
# declared nullable.
products_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("product_id", IntegerType()),
        ("category", StringType()),
        ("price", FloatType()),
    )
])

# Sample rows in schema order: (product_id, category, price).
products_data = [
    (1, "Apparel", 25.99),
    (2, "Apparel", 35.99),
    (3, "Footwear", 50.00),
    (4, "Footwear", 75.00),
    (5, "Apparel", 20.00),
]

products_df = spark.createDataFrame(data=products_data, schema=products_schema)

# Orders schema: three nullable integer columns.
orders_schema = StructType([
    StructField(column_name, IntegerType(), True)
    for column_name in ("order_id", "product_id", "quantity")
])

# Sample rows in schema order: (order_id, product_id, quantity).
orders_data = [
    (101, 1, 2),
    (102, 2, 1),
    (103, 1, 3),
    (104, 3, 1),
    (105, 4, 2),
]

orders_df = spark.createDataFrame(data=orders_data, schema=orders_schema)

# Per-category order count and average price.
# The inner join yields one row per order, so avg_price counts a product's
# price once for every order placed for it, and products that were never
# ordered drop out of both aggregates. This is what the expected output in
# the problem statement requires (Apparel: (25.99 + 35.99 + 25.99) / 3).
df_result = (
    products_df.join(orders_df, on="product_id", how="inner")
    .groupBy("category")
    .agg(
        F.count("order_id").alias("total_orders_count"),
        F.avg("price").alias("avg_price"),
    )
    # groupBy gives no row-order guarantee; sort so the displayed result is
    # the same on every run.
    .orderBy("category")
)

# Display result
df_result.show()
    

+--------+------------------+------------------+
|category|total_orders_count|         avg_price|
+--------+------------------+------------------+
| Apparel|                 3|29.323333740234375|
|Footwear|                 2|              62.5|
+--------+------------------+------------------+

