In [0]:
import pyspark.sql.functions as f
from pyspark.sql.functions import min, max, col
from pyspark.sql.types import DateType

**EX 01**

In [0]:
# Sample Data
data = [
    (1, "ITC", 59000, "2024-01-15"),
    (2, "BEML", 68000, "2023-12-10"),
    (3, "HCL", 53500, "2022-06-25"),
    (4, "AIRTEL", 77800, "2021-09-30"),
    (5, "ACT", 5550, "2024-05-15"),
    (6, "TATA", 95600, "2023-09-15"),
    (7, "BEML", 87500, "2025-02-05"),
    (8, "AIRTEL", 95600, "2021-06-20"),
    (9, "ACT", 65000, "2024-02-04"),
    (10, "ITC", 36700, "2022-09-08"),
    (11, "TATA", 175600, "2023-06-15"),
    (12, "ITC", 98700, "2022-12-18"),
    (13, "BEML", 99550, "2023-01-22"),
    (14, "AIRTEL", 395800, "2020-02-23")
]

# Define Schema
columns = ["ID", "Company", "Salary", "JoiningDate"]

# Create DataFrame
df = spark.createDataFrame(data, columns)
df = df.withColumn("JoiningDate", col("JoiningDate").cast(DateType()))
display(df)

ID,Company,Salary,JoiningDate
1,ITC,59000,2024-01-15
2,BEML,68000,2023-12-10
3,HCL,53500,2022-06-25
4,AIRTEL,77800,2021-09-30
5,ACT,5550,2024-05-15
6,TATA,95600,2023-09-15
7,BEML,87500,2025-02-05
8,AIRTEL,95600,2021-06-20
9,ACT,65000,2024-02-04
10,ITC,36700,2022-09-08


**1) Using agg() Function**

In [0]:
# Compute Max and Min using agg()
df.agg(max("Salary").alias("Max_Salary"), min("Salary").alias("Min_Salary")).display()
df.agg(max("JoiningDate").alias("Latest_JoiningDate"), min("JoiningDate").alias("Earliest_JoiningDate")).display()

Max_Salary,Min_Salary
395800,5550


Latest_JoiningDate,Earliest_JoiningDate
2025-02-05,2020-02-23


**2) Using select() with max() and min()**

In [0]:
df.select(max("Salary").alias("Max_Salary"), min("Salary").alias("Min_Salary")).display()
df.select(max("JoiningDate").alias("Latest_JoiningDate"), min("JoiningDate").alias("Earliest_JoiningDate")).display()

Max_Salary,Min_Salary
395800,5550


Latest_JoiningDate,Earliest_JoiningDate
2025-02-05,2020-02-23


**3) Using groupBy()**

In [0]:
df.groupBy("Company").agg(max("Salary").alias("Max_Salary"), min("Salary").alias("Min_Salary")).display()

Company,Max_Salary,Min_Salary
ITC,98700,36700
BEML,99550,68000
HCL,53500,53500
AIRTEL,395800,77800
ACT,65000,5550
TATA,175600,95600


**4) Using describe()**
- For **Numerical Columns**
- If your column is numeric, describe() provides statistics:
  - **count, mean, stddev, min, and max**.

In [0]:
df.select("Salary").describe().display()

summary,Salary
count,14.0
mean,100992.85714285714
stddev,93194.9339518287
min,5550.0
max,395800.0


**Summary of Methods:**

      |-------------------------------|---------------------|-----------------------|--------------------------------|
      |     Method	                  | Works for Integers	| Works for Dates	| Returns                        |
      |-------------------------------|---------------------|-----------------------|--------------------------------|
      | agg(max(), min())             | ✅ Yes              | ✅ Yes	        | Single row with min/max        |
      | select(max(), min())	  | ✅ Yes	        | ✅ Yes	        | Single row with min/max        |
      | groupBy().agg(max(), min())	  | ✅ Yes              | ✅ Yes	        | Min/max per group              |
      | describe()	                  | ✅ Yes	        | ❌ No	                | Statistics including min/max   |
      |-------------------------------|---------------------|-----------------------|--------------------------------|

**EX 02**

In [0]:
df_sam = spark.read.csv("/FileStore/tables/random_data-2.csv", header=True, inferSchema=True)
display(df_sam)

Company_Name,Product_Id,Product_Version_Id,Cust_Name,Category,Start_Date,Start_Cust_Date,End_Date,Updated_Date,Cust_Value,Cust_Type,Exchange,Location,Last_Date_UTC,Cust_Category,Index,impact1,impact2,impact3,Base_Start_Date,Base_End_Date,Base_Expiration_Date,Base_Last_Sales_Date
Sony,1,650,Naresh,Standard,3-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,30,STD,EUR,IND,1720000000000.0,SETTL,True,0,1,,1/5/2024,2024-05-31,2024-04-29,2024-04-29
Sony,2,651,kamal,Standard,6-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,25,STD,EUR,IND,1720000000000.0,TOI,False,0,1,,1/5/2024,2024-05-31,2024-04-29,2024-04-29
Sony,3,652,kajal,Standard,9-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,28,STD,EUR,IND,1720000000000.0,TOI,False,0,1,,1/10/2024,2024-10-31,2024-09-27,2024-09-27
Sony,4,653,kiran,Standard,3-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,31,STD,EUR,IND,1720000000000.0,TOI,False,0,1,,1/10/2024,2024-10-31,2024-09-27,2024-09-27
Sony,5,654,sam,Standard,8-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,34,STD,EUR,IND,1720000000000.0,TOI,False,0,1,,1/3/2024,2024-03-31,2024-03-28,2024-03-28
Sony,6,655,sourab,Standard,9-Jan-24,1730000000000.0,1740000000000.0,1730000000000.0,37,STD,EUR,IND,1720000000000.0,TOI,True,0,1,,1/3/2024,2024-03-31,2024-03-28,2024-03-28
Sony,7,656,jai,Upper,3-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,40,STD,EUR,IND,1720000000000.0,TOI,True,0,1,,1/4/2024,2024-04-30,2024-04-29,2024-04-29
BPL,8,657,sree,Upper,6-Mar-23,1730000000000.0,1730000000000.0,1730000000000.0,43,STD,EUR,IND,1720000000000.0,SETTL,True,0,1,,1/4/2024,2024-04-30,2024-04-29,2024-04-29
BPL,9,658,sreenath,Upper,9-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,46,STD,EUR,IND,1720000000000.0,SETTL,True,0,1,,1/5/2024,2024-05-31,2024-05-30,2024-05-30
BPL,10,659,kamaesh,Upper,3-Jan-25,1740000000000.0,1740000000000.0,1730000000000.0,49,STD,EUR,IND,1720000000000.0,SETTL,False,0,1,,1/5/2024,2024-05-31,2024-05-30,2024-05-30


     # aggregate
     min_value_custValue = df.agg({'Cust_Value': 'min'})
     max_value_custValue = df.agg({'Cust_Value': 'max'})
                              (or)
     min_value_custValue = df.agg({'Cust_Value': 'min', 'Product_Version_Id': 'min'})
     max_value_custValue = df.agg({'Cust_Value': 'max', 'Product_Version_Id': 'max'})
                              (or)
     min_max_custValue = df.agg(max('Cust_Value').alias('max_sal'), min('Cust_Value').alias('min_sal'))
     
     # collect
     min_value_custValue = df.select(min('Cust_Value')).collect()[0][0]
     max_value_custValue = df.select(max('Cust_Value')).collect()[0][0] 
                              (or)
     min_value_custValue = df.agg(min('Cust_Value').alias('min_Cust_Value')).collect()[0][0]
     max_value_custValue = df.agg(max('Cust_Value').alias('max_Cust_Value')).collect()[0][0]
                              (or)
     min_value_custValue = df.agg(min('Cust_Value').alias('min_Cust_Value')).collect()[0]['min_Cust_Value']
     max_value_custValue = df.agg(max('Cust_Value').alias('max_Cust_Value')).collect()[0]['max_Cust_Value']                      
     
     # first
     min_value_custValue = df.agg(min('Cust_Value').alias('min_Cust_Value')).first()['min_Cust_Value']
     max_value_custValue = df.agg(max('Cust_Value').alias('max_Cust_Value')).first()['max_Cust_Value']

**1) Integer Columns**

**a) Using select() with max() and min()**

In [0]:
min_value_custValue = df_sam.select(min('Cust_Value').alias('min_Cust_Value'))
max_value_custValue = df_sam.select(max('Cust_Value').alias('max_Cust_Value'))
min_max_custValue = df_sam.agg(max('Cust_Value').alias('max_sal'), min('Cust_Value').alias('min_sal'))

display(min_value_custValue)
display(max_value_custValue)
display(min_max_custValue)

min_Cust_Value
25


max_Cust_Value
166


max_sal,min_sal
166,25


- In PySpark, the **collect()** method is used to retrieve the results of a **DataFrame action** from the **cluster to the local machine**.
- When you use **select(min('Cust_Value'))**, it creates a **DataFrame** with the minimum value of the Cust_Value column, but it does not actually execute the query and retrieve the result **until you call an action like collect()**.

In [0]:
min_value_custValue = df_sam.select(min('Cust_Value')).collect()
max_value_custValue = df_sam.select(max('Cust_Value')).collect()

print(f"Minimum value: {min_value_custValue}")
print(f"Maximum value: {max_value_custValue}")

Minimum value: [Row(min(Cust_Value)=25)]
Maximum value: [Row(max(Cust_Value)=166)]


In [0]:
min_value_custValue = df_sam.select(min('Cust_Value')).collect()[0][0]
max_value_custValue = df_sam.select(max('Cust_Value')).collect()[0][0]

print(f"Minimum value: {min_value_custValue}")
print(f"Maximum value: {max_value_custValue}")

Minimum value: 25
Maximum value: 166


- If you want to get the minimum value **without using collect()**, you can use the **agg** method combined with **min** to perform the aggregation and then use **first()** to retrieve the result.

  - **agg(min('Cust_Value').alias('min_Cust_Value'))** performs the **aggregation** to find the **minimum value**.
  - **first()** retrieves the **first row** of the result.
  - **['min_Cust_Value']** accesses the value of the min_Cust_Value column from the row.

**b) Using agg() Function**

In [0]:
min_value_custValue = df_sam.agg(min('Cust_Value').alias('min_Cust_Value')).first()['min_Cust_Value']
print(min_value_custValue)

25


In [0]:
df_sam.agg(min('Cust_Value').alias('min_Cust_Value')).display()

min_Cust_Value
25


In [0]:
df_sam.agg(min('Cust_Value').alias('min_Cust_Value')).first()

Row(min_Cust_Value=25)

In [0]:
df_sam.agg(min('Cust_Value').alias('min_Cust_Value')).first()['min_Cust_Value']

25

In [0]:
min_value_custValue_collect = df_sam.agg(min('Cust_Value')).collect()[0][0]
print(min_value_custValue_collect)

25


**df.select(min('Cust_Value'))**

- It is typically used to fetch a **scalar result (a single value)** in the context of the DataFrame without requiring explicit grouping.

**df.agg(min('Cust_Value').alias('min_Cust_Value'))**

- This is a more formal **aggregation operation** where you use the agg function to **compute summary statistics**, such as the minimum of Cust_Value.
- **Use Case:** This is commonly used when computing **multiple aggregations (e.g., min, max, avg)** or when you want a labeled result for the aggregated values.

**b) Date**

**Using agg() Function**

In [0]:
# Find the minimum and maximum Start_Date
min_date_start = df_sam.agg(f.min("Start_Date")).collect()[0][0]
max_date_start = df_sam.agg(f.max("Start_Date")).collect()[0][0]

print(f"Minimum value: {min_date_start}")
print(f"Maximum value: {max_date_start}")

Minimum value: 1-Apr-23
Maximum value: 9-Mar-23


In [0]:
# Find the minimum and maximum Base_Start_Date
min_date_base_start = df_sam.agg(f.min("Base_Start_Date")).collect()[0][0]
max_date_base_start = df_sam.agg(f.max("Base_Start_Date")).collect()[0][0]

print(f"Minimum value: {min_date_base_start}")
print(f"Maximum value: {max_date_base_start}")

Minimum value: 1/1/2025
Maximum value: 1/8/2024


In [0]:
# Find the minimum and maximum Base_End_Date
min_date_base_end = df_sam.agg(f.min("Base_End_Date")).collect()[0][0]
max_date_base_end = df_sam.agg(f.max("Base_End_Date")).collect()[0][0]

print(f"Minimum value: {min_date_base_end}")
print(f"Maximum value: {max_date_base_end}")

Minimum value: 2024-03-31
Maximum value: 2025-12-31


In [0]:
# Find the minimum and maximum Base_Expiration_Date
min_date_base_exp = df_sam.agg(f.min("Base_Expiration_Date")).collect()[0][0]
max_date_base_exp = df_sam.agg(f.max("Base_Expiration_Date")).collect()[0][0]

print(f"Minimum value: {min_date_base_exp}")
print(f"Maximum value: {max_date_base_exp}")

Minimum value: 2024-02-28
Maximum value: 2024-12-27


In [0]:
# Find the minimum and maximum Base_Last_Sales_Date
min_date_base_last = df_sam.agg(f.min("Base_Last_Sales_Date")).collect()[0][0]
max_date_base_last = df_sam.agg(f.max("Base_Last_Sales_Date")).collect()[0][0]

print(f"Minimum value: {min_date_base_last}")
print(f"Maximum value: {max_date_base_last}")

Minimum value: 2024-02-28
Maximum value: 2025-12-31
