In [0]:
# Sample order lines: one tuple per order, laid out in the column order of `cols`.
cols = ["order_id", "category", "quantity", "price"]

data = [
    (101, "Electronics", 1, 15000), (102, "Electronics", 2, 12000),
    (103, "Fashion", 3, 800),       (104, "Fashion", 1, 1500),
    (105, "Grocery", 5, 200),       (106, "Grocery", 2, 300),
    (107, "Electronics", 1, 50000), (108, "Fashion", 4, 600),
]

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=cols)
df.show()

+--------+-----------+--------+-----+
|order_id|   category|quantity|price|
+--------+-----------+--------+-----+
|     101|Electronics|       1|15000|
|     102|Electronics|       2|12000|
|     103|    Fashion|       3|  800|
|     104|    Fashion|       1| 1500|
|     105|    Grocery|       5|  200|
|     106|    Grocery|       2|  300|
|     107|Electronics|       1|50000|
|     108|    Fashion|       4|  600|
+--------+-----------+--------+-----+



## **_Aggregation Functions_**

In [0]:
# -------------------------------------------------------------------
# Importing all required PySpark SQL functions
# sum, max, min, avg, count, etc.
# -------------------------------------------------------------------
# F is the explicit namespace for Spark SQL functions (F.sum, F.max, ...).
# The star import is kept for cells that still use bare names; note that it
# shadows Python built-ins such as sum, max, min and round.
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# -------------------------------------------------------------------
# Aggregating over the entire DataFrame
# agg() without groupBy() treats all rows as a single group and
# returns one summary row.
# Functions are called through the F namespace so it is obvious that
# these are Spark column aggregates, not the Python built-ins
# sum() / max() / min().
# -------------------------------------------------------------------
df.agg(
    F.sum("price").alias("total_price"),     # sum of all price values
    F.max("price").alias("max_price"),       # highest price
    F.min("price").alias("min_price"),       # lowest price
    F.avg("price").alias("avg_price"),       # mean price
    F.count("price").alias("total_record"),  # number of non-null price values
).show()


+-----------+---------+---------+---------+------------+
|total_price|max_price|min_price|avg_price|total_record|
+-----------+---------+---------+---------+------------+
|      80400|    50000|      200|  10050.0|           8|
+-----------+---------+---------+---------+------------+



## **_Group by_**

In [0]:
# -------------------------------------------------------------------
# Per-category summary
# groupBy("category") → one group per distinct category
# agg()               → evaluates every aggregate once per group
# Functions are called through the F namespace so they cannot be
# confused with the Python built-ins sum() / max() / min() / round().
# -------------------------------------------------------------------
df.groupBy("category").agg(
    F.sum("price").alias("total_price"),            # total price per category
    F.max("price").alias("max_price"),              # highest price per category
    F.min("price").alias("min_price"),              # lowest price per category
    F.round(F.avg("price"), 2).alias("avg_price"),  # mean price, 2 decimal places
    F.count("price").alias("total_record"),         # non-null price rows per category
).orderBy(
    # groupBy() gives no ordering guarantee; sort so the displayed table
    # is the same on every run.
    "category"
).show()


+-----------+-----------+---------+---------+---------+------------+
|   category|total_price|max_price|min_price|avg_price|total_record|
+-----------+-----------+---------+---------+---------+------------+
|Electronics|      77000|    50000|    12000| 25666.67|           3|
|    Fashion|       2900|     1500|      600|   966.67|           3|
|    Grocery|        500|      300|      200|    250.0|           2|
+-----------+-----------+---------+---------+---------+------------+



In [0]:
# -------------------------------------------------------------------
# Per-category summary, keeping only categories whose total is large
# Functions are called through the F namespace so they cannot be
# confused with the Python built-ins sum() / max() / min() / round().
# -------------------------------------------------------------------
df.groupBy("category").agg(
    F.sum("price").alias("total_price"),            # total price per category
    F.max("price").alias("max_price"),              # highest price per category
    F.min("price").alias("min_price"),              # lowest price per category
    F.round(F.avg("price"), 2).alias("avg_price"),  # mean price, 2 decimal places
    F.count("price").alias("total_record"),         # non-null price rows per category
).filter(
    # Filtering on an aggregated value (the SQL HAVING pattern):
    # total_price only exists after agg(), so the filter must follow it.
    # Keep only categories where total_price > 5000.
    F.col("total_price") > 5000
).show()


+-----------+-----------+---------+---------+---------+------------+
|   category|total_price|max_price|min_price|avg_price|total_record|
+-----------+-----------+---------+---------+---------+------------+
|Electronics|      77000|    50000|    12000| 25666.67|           3|
+-----------+-----------+---------+---------+---------+------------+



## **_Window Functions_**

In [0]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# One row per (student, subject) mark: three subjects for each of five students.
columns = ["student_id", "name", "subject", "marks"]

data = [
    (1, "Rahul", "Math", 78), (1, "Rahul", "Science", 75), (1, "Rahul", "English", 69),
    (2, "Priya", "Math", 92), (2, "Priya", "Science", 81), (2, "Priya", "English", 95),
    (3, "Amit", "Math", 65),  (3, "Amit", "Science", 72),  (3, "Amit", "English", 69),
    (4, "Sneha", "Math", 90), (4, "Sneha", "Science", 85), (4, "Sneha", "English", 78),
    (5, "Rohan", "Math", 90), (5, "Rohan", "Science", 75), (5, "Rohan", "English", 68),
]

# NOTE: this rebinds `df`; from here on it refers to the student-marks data.
df = spark.createDataFrame(data, schema=columns)
df.show()

+----------+-----+-------+-----+
|student_id| name|subject|marks|
+----------+-----+-------+-----+
|         1|Rahul|   Math|   78|
|         1|Rahul|Science|   75|
|         1|Rahul|English|   69|
|         2|Priya|   Math|   92|
|         2|Priya|Science|   81|
|         2|Priya|English|   95|
|         3| Amit|   Math|   65|
|         3| Amit|Science|   72|
|         3| Amit|English|   69|
|         4|Sneha|   Math|   90|
|         4|Sneha|Science|   85|
|         4|Sneha|English|   78|
|         5|Rohan|   Math|   90|
|         5|Rohan|Science|   75|
|         5|Rohan|English|   68|
+----------+-----+-------+-----+



In [0]:
# -------------------------------------------------------------------
# Importing required functions and Window specification module
# -------------------------------------------------------------------
from pyspark.sql.functions import *
from pyspark.sql.window import Window


# -------------------------------------------------------------------
# Window specification used by the ranking functions below
# partitionBy("subject") → each subject is ranked independently
# orderBy(marks desc)    → highest marks come first inside a subject
# -------------------------------------------------------------------
windowSpec = Window.partitionBy("subject").orderBy(col("marks").desc())


# -------------------------------------------------------------------
# Add the three ranking columns in a single projection
# row_number()  → 1, 2, 3, ... always unique; rows with equal marks
#                 still get different numbers (their relative order
#                 is not fixed, because the window sorts by marks only)
# rank()        → equal marks share a rank, the following rank is skipped
# dense_rank()  → equal marks share a rank, no ranks are skipped
# -------------------------------------------------------------------
df.select(
    "*",
    row_number().over(windowSpec).alias("row_number"),
    rank().over(windowSpec).alias("rank"),
    dense_rank().over(windowSpec).alias("dense_rank"),
).show()


+----------+-----+-------+-----+----------+----+------+
|student_id| name|subject|marks|row_number|rank|d_rank|
+----------+-----+-------+-----+----------+----+------+
|         2|Priya|English|   95|         1|   1|     1|
|         4|Sneha|English|   78|         2|   2|     2|
|         1|Rahul|English|   69|         3|   3|     3|
|         3| Amit|English|   69|         4|   3|     3|
|         5|Rohan|English|   68|         5|   5|     4|
|         2|Priya|   Math|   92|         1|   1|     1|
|         4|Sneha|   Math|   90|         2|   2|     2|
|         5|Rohan|   Math|   90|         3|   2|     2|
|         1|Rahul|   Math|   78|         4|   4|     3|
|         3| Amit|   Math|   65|         5|   5|     4|
|         4|Sneha|Science|   85|         1|   1|     1|
|         2|Priya|Science|   81|         2|   2|     2|
|         1|Rahul|Science|   75|         3|   3|     3|
|         5|Rohan|Science|   75|         4|   3|     3|
|         3| Amit|Science|   72|         5|   5|

In [0]:
# ---------------------------------------------------------------
# Window specification: one partition per student ('name'), rows
# ordered by marks from highest to lowest.
# Used below for LAG, LEAD and a windowed SUM.
# Functions are called through the F namespace so that F.sum cannot
# be confused with the Python built-in sum().
# ---------------------------------------------------------------
windowspace = Window.partitionBy("name").orderBy(F.col("marks").desc())

(
    df
    # LAG → marks of the preceding row in the window. Because the window is
    # ordered by marks descending, that is the student's next-higher mark.
    # NULL on the first row of each partition.
    .withColumn("prev_mark", F.lag("marks").over(windowspace))
    # Current marks minus prev_mark. With the descending order this is never
    # positive; NULL wherever prev_mark is NULL.
    .withColumn("diff_m", F.col("marks") - F.col("prev_mark"))
    # LEAD → marks of the following row in the window, i.e. the student's
    # next-lower mark. NULL on the last row of each partition.
    .withColumn("next_mark", F.lead("marks").over(windowspace))
    # SUM OVER WINDOW → rows are not collapsed. Because windowspace has an
    # orderBy, the default frame runs from the start of the partition to the
    # current row, so Total_marks is a RUNNING total; the last row of each
    # student holds the full total. For the same per-student total on every
    # row, sum over Window.partitionBy("name") without an orderBy instead.
    .withColumn("Total_marks", F.sum("marks").over(windowspace))
    .show()
)


+----------+-----+-------+-----+---------+------+---------+-----------+
|student_id| name|subject|marks|prev_mark|diff_m|next_mark|Total_marks|
+----------+-----+-------+-----+---------+------+---------+-----------+
|         3| Amit|Science|   72|     NULL|  NULL|       69|         72|
|         3| Amit|English|   69|       72|    -3|       65|        141|
|         3| Amit|   Math|   65|       69|    -4|     NULL|        206|
|         2|Priya|English|   95|     NULL|  NULL|       92|         95|
|         2|Priya|   Math|   92|       95|    -3|       81|        187|
|         2|Priya|Science|   81|       92|   -11|     NULL|        268|
|         1|Rahul|   Math|   78|     NULL|  NULL|       75|         78|
|         1|Rahul|Science|   75|       78|    -3|       69|        153|
|         1|Rahul|English|   69|       75|    -6|     NULL|        222|
|         5|Rohan|   Math|   90|     NULL|  NULL|       75|         90|
|         5|Rohan|Science|   75|       90|   -15|       68|     

## **_Joins_**
- Inner Join : Only matching rows from both DataFrames
- Outer Join
    - Left : Matching rows from both tables + additional rows from the left table
    - Right : Matching rows from both tables + additional rows from the right table
    - Full
- Self Join
- Cross Join
- Semi Join
- Anti Join      

In [0]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# --- students: student 6 has no row in enrollments -----------------
students_data = [
    (1, "Rahul", "M"),
    (2, "Priya", "F"),
    (3, "Amit", "M"),
    (4, "Sneha", "F"),
    (6, "Kiran", "M"),  # no enrollment
]
students_cols = ["student_id", "name", "gender"]
students_df = spark.createDataFrame(students_data, schema=students_cols)

# --- courses: course 105 has no row in enrollments -----------------
courses_data = [
    (101, "Math"),
    (102, "Science"),
    (103, "English"),
    (104, "Computer"),
    (105, "Biology"),  # no enrollment
]
courses_cols = ["course_id", "course_name"]
courses_df = spark.createDataFrame(courses_data, schema=courses_cols)

# --- enrollments: links a student to a course with the marks obtained;
#     student 5 does not appear in the students table ---------------
enrollments_data = [
    (1, 101, 78),
    (1, 102, 88),
    (2, 101, 92),
    (2, 103, 95),
    (3, 101, 65),
    (4, 103, 78),
    (5, 101, 70),  # student not in students table
]
enrollments_cols = ["student_id", "course_id", "marks"]
enrollments_df = spark.createDataFrame(enrollments_data, schema=enrollments_cols)

# Display the three input tables.
students_df.show()
courses_df.show()
enrollments_df.show()


+----------+-----+------+
|student_id| name|gender|
+----------+-----+------+
|         1|Rahul|     M|
|         2|Priya|     F|
|         3| Amit|     M|
|         4|Sneha|     F|
|         6|Kiran|     M|
+----------+-----+------+

+---------+-----------+
|course_id|course_name|
+---------+-----------+
|      101|       Math|
|      102|    Science|
|      103|    English|
|      104|   Computer|
|      105|    Biology|
+---------+-----------+

+----------+---------+-----+
|student_id|course_id|marks|
+----------+---------+-----+
|         1|      101|   78|
|         1|      102|   88|
|         2|      101|   92|
|         2|      103|   95|
|         3|      101|   65|
|         4|      103|   78|
|         5|      101|   70|
+----------+---------+-----+



In [0]:
# Join enrollments (left) with students (right) on student_id, once per
# join type, and display each result in turn.
join_types = [
    "inner",      # only rows whose student_id is present on both sides
    "left",       # every enrollment; name/gender are NULL when no student matches
    "right",      # every student; course_id/marks are NULL when no enrollment matches
    "full",       # all the rows from both tables
    "left_semi",  # enrollments that have a matching student (left columns only)
    "left_anti",  # enrollments that have NO matching student (left columns only)
]

for how in join_types:
    enrollments_df.join(students_df, on="student_id", how=how).show()

+----------+---------+-----+-----+------+
|student_id|course_id|marks| name|gender|
+----------+---------+-----+-----+------+
|         1|      102|   88|Rahul|     M|
|         1|      101|   78|Rahul|     M|
|         2|      103|   95|Priya|     F|
|         2|      101|   92|Priya|     F|
|         3|      101|   65| Amit|     M|
|         4|      103|   78|Sneha|     F|
+----------+---------+-----+-----+------+

+----------+---------+-----+-----+------+
|student_id|course_id|marks| name|gender|
+----------+---------+-----+-----+------+
|         1|      101|   78|Rahul|     M|
|         1|      102|   88|Rahul|     M|
|         2|      101|   92|Priya|     F|
|         2|      103|   95|Priya|     F|
|         3|      101|   65| Amit|     M|
|         4|      103|   78|Sneha|     F|
|         5|      101|   70| NULL|  NULL|
+----------+---------+-----+-----+------+

+----------+---------+-----+-----+------+
|student_id|course_id|marks| name|gender|
+----------+---------+-----+----