In [0]:
# Imported in this cell so it runs on a fresh kernel: in the visible part of
# the notebook these names are only imported by a later cell.
from pyspark.sql.functions import col, desc, rank
from pyspark.sql.window import Window

# Sample sales rows: (category, product, sales).
data = [
    ("Electronics", "Laptop", 1200),
    ("Electronics", "Phone", 900),
    ("Electronics", "Tablet", 700),
    ("Clothing", "Shirt", 400),
    ("Clothing", "Jeans", 600),
    ("Clothing", "Jacket", 800),
]

schema = ["category", "product", "sales"]

df1 = spark.createDataFrame(data, schema)

# Note: Window has no groupBy(); per-group windows are declared with
# partitionBy(). Rows are ordered inside each category, highest sales first.
wind1 = Window.partitionBy('category').orderBy(desc('sales'))

# Attach each product's rank within its category.
ranked_df = df1.withColumn("rank", rank().over(wind1))
ranked_df.show()

# Keep ranks 1 and 2 per category. rank() gives tied rows the same rank, so
# tied sales values can let more than two rows through for a category.
top_2 = ranked_df.filter(col("rank") <= 2)

# Show the result with and without the helper rank column.
top_2.show()
top_2.drop('rank').show()

+-----------+-------+-----+----+
|   category|product|sales|rank|
+-----------+-------+-----+----+
|   Clothing| Jacket|  800|   1|
|   Clothing|  Jeans|  600|   2|
|   Clothing|  Shirt|  400|   3|
|Electronics| Laptop| 1200|   1|
|Electronics|  Phone|  900|   2|
|Electronics| Tablet|  700|   3|
+-----------+-------+-----+----+

+-----------+-------+-----+----+
|   category|product|sales|rank|
+-----------+-------+-----+----+
|   Clothing| Jacket|  800|   1|
|   Clothing|  Jeans|  600|   2|
|Electronics| Laptop| 1200|   1|
|Electronics|  Phone|  900|   2|
+-----------+-------+-----+----+

+-----------+-------+-----+
|   category|product|sales|
+-----------+-------+-----+
|   Clothing| Jacket|  800|
|   Clothing|  Jeans|  600|
|Electronics| Laptop| 1200|
|Electronics|  Phone|  900|
+-----------+-------+-----+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *

# Sample transactions: (user_id, txn_date as a yyyy-MM-dd string, amount).
data = [
    (1, "2025-01-01", 500),
    (1, "2025-01-03", -200),
    (1, "2025-01-05", 300),
    (2, "2025-01-02", 1000),
    (2, "2025-01-04", -400),
]

columns = ["user_id", "txn_date", "amount"]

# Build the frame and convert the txn_date strings to dates in one chain.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("txn_date", to_date("txn_date"))
)

# Per-user frame in date order, spanning the first row through the current row.
window1 = (
    Window.partitionBy("user_id")
    .orderBy("txn_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# `sum` here is the pyspark.sql.functions aggregate brought in by the wildcard
# import at the top of this cell, not Python's builtin sum.
result = df.withColumn(
    "running_balance",
    sum(col("amount")).over(window1),
)
result.show()

+-------+----------+------+---------------+
|user_id|  txn_date|amount|running_balance|
+-------+----------+------+---------------+
|      1|2025-01-01|   500|            500|
|      1|2025-01-03|  -200|            300|
|      1|2025-01-05|   300|            600|
|      2|2025-01-02|  1000|           1000|
|      2|2025-01-04|  -400|            600|
+-------+----------+------+---------------+



In [0]:
# Register df as the temp view "transactions" so the %sql cell below can query it.
df.createOrReplaceTempView("transactions")

In [0]:
%sql
-- Running balance per user: cumulative sum of amount in txn_date order,
-- from the user's first row up to and including the current row.
SELECT
    user_id,
    txn_date,
    amount,
    SUM(amount) OVER (
        PARTITION BY user_id
        ORDER BY txn_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_balance
FROM transactions;

user_id,txn_date,amount,running_balance
1,2025-01-01,500,500
1,2025-01-03,-200,300
1,2025-01-05,300,600
2,2025-01-02,1000,1000
2,2025-01-04,-400,600
