In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Goal: keep only the earliest purchase of every customer.

# purchase_date is kept as a string; sorting it as text is chronological here
# because every sample value is zero-padded ISO-8601 (yyyy-MM-dd).
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("purchase_date", StringType(), True),
    StructField("amount", IntegerType(), True),
])

data = [
    (101, "2025-01-03", 250),
    (101, "2025-01-05", 300),
    (102, "2025-01-01", 150),
    (102, "2025-01-02", 200),
    (103, "2025-01-04", 500),
]

# NOTE(review): `spark` is not defined in this cell; presumably it is the
# SparkSession injected by the notebook runtime -- confirm.
ddf = spark.createDataFrame(data, schema)

# Number each customer's purchases from oldest to newest.
# `amount` is a secondary sort key: with purchase_date alone, two purchases on
# the same day would tie and row_number() could label either one as rn == 1,
# so the selected row could differ between runs. Rows tied on both keys carry
# identical values in every selected column, so the result is deterministic.
window_date = Window.partitionBy('customer_id').orderBy('purchase_date', 'amount')

df_date = ddf.withColumn("rn", row_number().over(window_date))

# rn == 1 is the first purchase per customer; the helper column is not kept.
ddf_date = df_date.filter(col("rn") == 1).select('customer_id', 'purchase_date', 'amount')
ddf_date.show()

+-----------+-------------+------+
|customer_id|purchase_date|amount|
+-----------+-------------+------+
|        101|   2025-01-03|   250|
|        102|   2025-01-01|   150|
|        103|   2025-01-04|   500|
+-----------+-------------+------+



In [0]:
# Goal: for each employee, count how many salary records differ from the
# record immediately before them, and list employees with at least one change.

schema = (
    StructType()
    .add("emp_id", IntegerType(), True)
    .add("effective_date", StringType(), True)
    .add("salary", IntegerType(), True)
)

data = [
    (1, "2025-01-01", 50000),
    (1, "2025-02-01", 55000),
    (1, "2025-03-01", 60000),
    (2, "2025-01-15", 40000),
    (2, "2025-03-01", 45000),
    (3, "2025-01-10", 30000),
]

df_emp = spark.createDataFrame(data, schema)

# One window per employee, walking the salary history in effective_date order.
window_emp = Window.partitionBy('emp_id').orderBy('effective_date')

# lag_sal holds the salary from the preceding record of the same employee;
# it is null on the employee's first record.
ddf_emp = df_emp.withColumn('lag_sal', lag('salary', 1).over(window_emp))

# Flag a record as changed when its salary differs from the previous one.
# On an employee's first record the comparison against null is not true,
# so the otherwise(0) branch applies and the first record is never flagged.
ddf_emp1 = ddf_emp.withColumn(
    "changed",
    when(col("salary") != col("lag_sal"), 1).otherwise(0),
)
ddf_emp1.show()

# `sum` here is the Spark aggregate brought in by the star import of
# pyspark.sql.functions, not Python's built-in sum.
ddf_emp2 = (
    ddf_emp1
    .groupBy('emp_id')
    .agg(sum('changed').alias('salary_change_count'))
    .where(col('salary_change_count') >= 1)
)
ddf_emp2.show()

+------+--------------+------+-------+-------+
|emp_id|effective_date|salary|lag_sal|changed|
+------+--------------+------+-------+-------+
|     1|    2025-01-01| 50000|   null|      0|
|     1|    2025-02-01| 55000|  50000|      1|
|     1|    2025-03-01| 60000|  55000|      1|
|     2|    2025-01-15| 40000|   null|      0|
|     2|    2025-03-01| 45000|  40000|      1|
|     3|    2025-01-10| 30000|   null|      0|
+------+--------------+------+-------+-------+

+------+-------------------+
|emp_id|salary_change_count|
+------+-------------------+
|     1|                  2|
|     2|                  1|
+------+-------------------+

