# In [3]:
'''
You are given a dataset containing transaction records with columns (category, sub_category, amount).

Write a PySpark query to group the data by category and sub_category while aggregating amount into a set (no duplicates). Order the amount set in ascending order.

The result should contain unique (category, sub_category) pairs, with the amount values rendered as a single string separated by ", " (a comma followed by a space).

Note - This question was modified on 10 Aug 2025 due to an ambiguity in the problem statement

Starter Code
Below is the starter code to create the input DataFrame:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Define schema
schema = StructType([
    StructField("category", StringType(), True),
    StructField("sub_category", StringType(), True),
    StructField("amount", IntegerType(), True)
])

# Sample data
data = [
    ("Electronics", "Laptop", 1000),
    ("Electronics", "Laptop", 1200),
    ("Furniture", "Chair", 200),
    ("Furniture", "Chair", 150),
    ("Furniture", "Chair", 180),
    ("Furniture", "Chair", 200),
    ("Electronics","iPhone", 600),
    ("Electronics","iPhone", 400),
]

# Create DataFrame
df = spark.createDataFrame(data, schema)
df.show()
Sample Output Schema and Example Table
Use display(df_result) to show the final DataFrame.

Output Schema:

Column	Type
category	String
sub_category	String
amount	String
Example Output (First 2 rows):

category	sub_category	amount
Electronics	Laptop	1000, 1200
Furniture	Chair	150, 180, 200
Explanation
The dataset is grouped by category and sub_category.
The amount values for each (category, sub_category) pair are aggregated into a sorted set.
The final result contains unique (category, sub_category) pairs, each with all of its amount values joined into one string separated by ", " (a comma followed by a space).
'''

'''

Bonus Challenge: Can you solve this using Spark SQL and temporary views?

# Register DataFrame as temp view
df.createOrReplaceTempView("transactions")

sql_query = """
SELECT
    category,
    sub_category,
    concat_ws(
        ', ',
        sort_array(
            array_distinct(
                collect_set(amount)
            )
        )
    ) AS amount
FROM transactions
GROUP BY category, sub_category
"""

df_result = spark.sql(sql_query)
'''

# Initialize Spark session
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Get (or create) the Spark session used by the rest of the script.
spark = SparkSession.builder.appName('Spark Playground').getOrCreate()

# Schema of the transaction records: every column is declared nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("category", StringType()),
            ("sub_category", StringType()),
            ("amount", IntegerType()),
        )
    ]
)

# Sample transactions as (category, sub_category, amount) tuples.
# The Furniture/Chair group deliberately repeats the amount 200.
data = [
    ("Electronics", "Laptop", 1000),
    ("Electronics", "Laptop", 1200),
    ("Furniture", "Chair", 200),
    ("Furniture", "Chair", 150),
    ("Furniture", "Chair", 180),
    ("Furniture", "Chair", 200),
    ("Electronics", "iPhone", 600),
    ("Electronics", "iPhone", 400),
]

# Materialise the sample rows as the input DataFrame.
df = spark.createDataFrame(data, schema=schema)

# One output row per (category, sub_category); "amount" holds that group's
# distinct amounts, sorted ascending and joined into a single string.
df_result = (
    df.groupBy("category", "sub_category")
    .agg(
        F.concat_ws(
            ", ",  # separator between amounts
            # collect_set removes duplicates while aggregating, so no separate
            # array_distinct pass is needed. Sorting is done on the integer
            # array (numeric order) and only then are the values cast to
            # strings, so concat_ws gets an explicit array<string> instead of
            # relying on implicit coercion.
            F.array_sort(F.collect_set("amount")).cast("array<string>"),
        ).alias("amount")
    )
)


# Display result
df_result.show()

# Output:
# +-----------+------------+-------------+
# |   category|sub_category|       amount|
# +-----------+------------+-------------+
# |Electronics|      Laptop|   1000, 1200|
# |  Furniture|       Chair|150, 180, 200|
# |Electronics|      iPhone|     400, 600|
# +-----------+------------+-------------+

