In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Employee salary ranking: build a small in-memory frame and rank salaries
# inside each department, highest salary first.
spark = (
    SparkSession.builder
    .appName("EmployeeData")
    .getOrCreate()
)

# Column names for the employee rows below: (emp_id, dept, salary).
columns = ["emp_id", "dept", "salary"]

data = [
    (1, "HR", 60000), (2, "HR", 75000), (3, "HR", 50000),
    (4, "IT", 90000), (5, "IT", 85000),
]

df = spark.createDataFrame(data, columns)
df.show()

# One window per department, ordered by salary descending so that the top
# earner in each department receives rank 1.
win1 = Window.partitionBy('dept').orderBy(desc('salary'))

df1 = df.withColumn('rank1', rank().over(win1))
df1.show()

+------+----+------+
|emp_id|dept|salary|
+------+----+------+
|     1|  HR| 60000|
|     2|  HR| 75000|
|     3|  HR| 50000|
|     4|  IT| 90000|
|     5|  IT| 85000|
+------+----+------+

+------+----+------+-----+
|emp_id|dept|salary|rank1|
+------+----+------+-----+
|     2|  HR| 75000|    1|
|     1|  HR| 60000|    2|
|     3|  HR| 50000|    3|
|     4|  IT| 90000|    1|
|     5|  IT| 85000|    2|
+------+----+------+-----+



In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Product month coverage: list the products that have at least one sale in
# twelve distinct year-month buckets.
spark = (
    SparkSession.builder
    .appName("ProductSales")
    .getOrCreate()
)

# Column names for the sale rows below: (product_id, sale_date as 'yyyy-MM-dd' text).
columns = ["product_id", "sale_date"]

data = [
    # P1
    ("P1", "2025-01-10"), ("P1", "2025-02-12"), ("P1", "2025-03-09"), ("P1", "2025-04-18"),
    ("P1", "2025-05-03"), ("P1", "2025-06-27"), ("P1", "2025-07-14"), ("P1", "2025-08-21"),
    ("P1", "2025-09-02"), ("P1", "2025-10-11"), ("P1", "2025-11-06"), ("P1", "2025-12-05"),
    # P2
    ("P2", "2025-01-05"), ("P2", "2025-02-10"), ("P2", "2025-03-15"), ("P2", "2025-05-20"),
    ("P2", "2025-06-08"), ("P2", "2025-07-22"), ("P2", "2025-08-30"), ("P2", "2025-10-01"),
    ("P2", "2025-11-19"), ("P2", "2025-12-07"),
    # P3
    ("P3", "2025-01-02"), ("P3", "2025-02-14"), ("P3", "2025-03-03"), ("P3", "2025-04-25"),
    ("P3", "2025-05-09"), ("P3", "2025-06-16"), ("P3", "2025-07-07"), ("P3", "2025-08-12"),
    ("P3", "2025-09-28"), ("P3", "2025-10-20"), ("P3", "2025-11-03"), ("P3", "2025-12-29"),
]

df = spark.createDataFrame(data, columns)

# Reduce every sale date to its 'yyyy-MM' key so that several sales in the
# same month collapse to a single value when counted distinctly.
# NOTE(review): the key includes the year, so input spanning several years
# could produce more than 12 distinct values per product — confirm that the
# data is expected to cover a single year.
df1 = df.select(
    col('product_id'),
    date_format(col('sale_date'), 'yyyy-MM').alias('month'),
)

# Number of distinct month keys seen for each product.
df2 = (
    df1
    .groupBy('product_id')
    .agg(countDistinct(col('month')).alias("month_count"))
)
df2.show()

# Keep only the products whose distinct-month count is exactly 12, sorted by id.
res = (
    df2
    .where(df2['month_count'] == 12)
    .select('product_id')
    .orderBy('product_id')
)
res.show()

+----------+-----------+
|product_id|month_count|
+----------+-----------+
|        P2|         10|
|        P3|         12|
|        P1|         12|
+----------+-----------+

+----------+
|product_id|
+----------+
|        P1|
|        P3|
+----------+

