**first:**
- is an aggregate function, also usable as a **window function**, that returns the **first value** of a column in each group or window.

      df.withColumn("FirstValue", first("columnA").over(Window.partitionBy("ColumnB").orderBy("ColumnC")))

**last:**
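- is an aggregate function, also usable as a **window function**, that returns the **last value** of a column in each group or window.
- when the window has an `orderBy`, the default frame ends at the current row, so extend the frame to the whole partition to get the partition's last value:

      w = Window.partitionBy("ColumnB").orderBy("ColumnC").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
      df.withColumn("LastValue", last("columnA").over(w))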

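**ignorenulls:** bool, default `False`
- if `True`, null values are skipped, so `first()` returns the **first non-null value** and `last()` returns the **last non-null value** (or **null** if every value is null).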

**return first value**

     df.select(first("salary"))     
     df.select(first(df.salary, ignorenulls=True)) # To return the first non-null value instead

**group by & aggregate**

     df.groupBy("department") \
       .agg(first("salary").alias("First_Salary"), last("salary").alias("Last_Salary"))

     df.groupBy("department") \
       .agg(first("salary", ignorenulls=True).alias("First_Salary"), last("salary", ignorenulls=True).alias("Last_Salary")) \
       .orderBy("department")

**window function**

     from pyspark.sql.window import Window
     df_window_null = df.withColumn("first_salary", first("salary").over(Window.partitionBy("department"))) \
                        .withColumn("last_salary", last("salary").over(Window.partitionBy("department"))) \
                        .orderBy("department")

     df_window_ignore = df.withColumn("first_salary", first("salary", ignorenulls=True).over(Window.partitionBy("department"))) \
                          .withColumn("last_salary", last("salary", ignorenulls=True).over(Window.partitionBy("department"))) \
                          .orderBy("department")

In [0]:
from pyspark.sql.functions import first, last, to_date, col

#### **PySpark**

In [0]:
# Sample employee records: (employee_name, department, salary, start_date).
# Some salaries are None so the null handling of first()/last() can be shown.
schema = ["employee_name", "department", "salary", "start_date"]
data = [
    ("Prakash", "IT", 8000, "2023-03-15"),
    ("Syamala", "Finance", 7600, "2023-04-16"),
    ("Ritesh", "IT", 5100, "2023-05-10"),
    ("Robert", "Marketing", 4000, "2023-06-25"),
    ("Harsha", "Sales", 2000, "2023-07-27"),
    ("Harsha", "Sales", None, "2023-08-11"),
    ("Senthil", "Finance", 3500, "2023-09-12"),
    ("Parthiv", "IT", 4900, "2023-10-13"),
    ("Prabhav", "Marketing", 4000, "2023-11-19"),
    ("Prabhav", "Marketing", None, "2023-12-20"),
    ("Pandya", "IT", 3000, "2023-01-01"),
    ("Anil", "Sales", 5100, "2024-10-04"),
    ("Anil", "Sales", None, "2024-09-08"),
]

# start_date is supplied as a string; convert it to a date column.
df = spark.createDataFrame(data, schema).withColumn(
    "start_date", to_date(col("start_date"))
)
display(df)

employee_name,department,salary,start_date
Prakash,IT,8000.0,2023-03-15
Syamala,Finance,7600.0,2023-04-16
Ritesh,IT,5100.0,2023-05-10
Robert,Marketing,4000.0,2023-06-25
Harsha,Sales,2000.0,2023-07-27
Harsha,Sales,,2023-08-11
Senthil,Finance,3500.0,2023-09-12
Parthiv,IT,4900.0,2023-10-13
Prabhav,Marketing,4000.0,2023-11-19
Prabhav,Marketing,,2023-12-20


**1) Using first() and last() with orderBy()**

In [0]:
# Sort by salary in ascending order; rows with a null salary come first.
# The first()/last() examples in the following cells rely on this ordering.
df = df.orderBy(col("salary").asc())
display(df)

employee_name,department,salary,start_date
Harsha,Sales,,2023-08-11
Prabhav,Marketing,,2023-12-20
Anil,Sales,,2024-09-08
Harsha,Sales,2000.0,2023-07-27
Pandya,IT,3000.0,2023-01-01
Senthil,Finance,3500.0,2023-09-12
Robert,Marketing,4000.0,2023-06-25
Prabhav,Marketing,4000.0,2023-11-19
Parthiv,IT,4900.0,2023-10-13
Anil,Sales,5100.0,2024-10-04


In [0]:
# With df sorted by salary, take the salary of the first and of the last row.
# The first row has a null salary, so First_Salary comes back as null.
df.select(
    first("salary").alias("First_Salary"),
    last("salary").alias("Last_Salary"),
).display()

First_Salary,Last_Salary
,8000


**2) Using first() and last() with select()**

In [0]:
# DataFrame.first() is a DataFrame method, not the first() aggregate imported
# from pyspark.sql.functions: it returns the first row as a Row object.
df.first()

Row(employee_name='Harsha', department='Sales', salary=None, start_date=datetime.date(2023, 8, 11))

In [0]:
# first() as an aggregate over the whole DataFrame. Nulls are not skipped
# (ignorenulls=False is the default), so the leading null salary is returned.
df.select(first("salary", ignorenulls=False)).display()

first(salary)
""


In [0]:
# Skip nulls so the first non-null salary is returned instead.
df.select(first("salary", ignorenulls=True)).display()

first(salary)
2000


In [0]:
# last() as an aggregate over the whole DataFrame (nulls are not skipped,
# ignorenulls=False is the default): returns the salary of the last row.
df.select(last("salary", ignorenulls=False)).display()

last(salary)
8000


**3) Using first() and last() with groupBy()**

In [0]:
# Preview the rows department by department, lowest salary first, to make the
# per-department first/last values in the next cells easy to check by eye.
df.orderBy("department", "salary").display()

employee_name,department,salary,start_date
Senthil,Finance,3500.0,2023-09-12
Syamala,Finance,7600.0,2023-04-16
Pandya,IT,3000.0,2023-01-01
Parthiv,IT,4900.0,2023-10-13
Ritesh,IT,5100.0,2023-05-10
Prakash,IT,8000.0,2023-03-15
Prabhav,Marketing,,2023-12-20
Prabhav,Marketing,4000.0,2023-11-19
Robert,Marketing,4000.0,2023-06-25
Anil,Sales,,2024-09-08


In [0]:
# First and last salary per department, nulls included.
# NOTE: Spark documents first()/last() as non-deterministic after a shuffle;
# the values depend on the order in which rows reach each group.
(
    df.groupBy("department")
    .agg(
        first("salary").alias("First_Salary"),
        last("salary").alias("Last_Salary"),
    )
    .display()
)

department,First_Salary,Last_Salary
Sales,,5100
Finance,3500.0,7600
Marketing,,4000
IT,3000.0,8000


In [0]:
# First and last non-null salary per department, listed by department name.
# NOTE: Spark documents first()/last() as non-deterministic after a shuffle;
# the values depend on the order in which rows reach each group.
(
    df.groupBy("department")
    .agg(
        first("salary", ignorenulls=True).alias("First_Salary"),
        last("salary", ignorenulls=True).alias("Last_Salary"),
    )
    .orderBy("department")
    .display()
)

department,First_Salary,Last_Salary
Finance,3500,7600
IT,3000,8000
Marketing,4000,4000
Sales,2000,5100


In [0]:
from pyspark.sql.window import Window

# first()/last() depend on row order, and a window built from partitionBy()
# alone does not define one, so the result could change between runs.
# Order each department by salary (ascending, nulls first) and make the frame
# span the whole partition so every row sees the same first and last value.
dept_by_salary = (
    Window.partitionBy("department")
    .orderBy("salary")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# Nulls are not skipped here, so a department whose lowest-sorted salary is
# null gets a null first_salary.
df_window_null = (
    df.withColumn("first_salary", first("salary").over(dept_by_salary))
    .withColumn("last_salary", last("salary").over(dept_by_salary))
    .orderBy("department")
)
display(df_window_null)

employee_name,department,salary,start_date,first_salary,last_salary
Senthil,Finance,3500.0,2023-09-12,3500.0,7600
Syamala,Finance,7600.0,2023-04-16,3500.0,7600
Pandya,IT,3000.0,2023-01-01,3000.0,8000
Parthiv,IT,4900.0,2023-10-13,3000.0,8000
Ritesh,IT,5100.0,2023-05-10,3000.0,8000
Prakash,IT,8000.0,2023-03-15,3000.0,8000
Prabhav,Marketing,,2023-12-20,,4000
Robert,Marketing,4000.0,2023-06-25,,4000
Prabhav,Marketing,4000.0,2023-11-19,,4000
Harsha,Sales,,2023-08-11,,5100


In [0]:
# first()/last() depend on row order, and a window built from partitionBy()
# alone does not define one, so the result could change between runs.
# Order each department by salary (ascending, nulls first) and make the frame
# span the whole partition so every row sees the same first and last value.
dept_by_salary = (
    Window.partitionBy("department")
    .orderBy("salary")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# ignorenulls=True skips null salaries, so each department reports its first
# and last non-null salary.
df_window_ignore = (
    df.withColumn(
        "first_salary", first("salary", ignorenulls=True).over(dept_by_salary)
    )
    .withColumn(
        "last_salary", last("salary", ignorenulls=True).over(dept_by_salary)
    )
    .orderBy("department")
)
display(df_window_ignore)

employee_name,department,salary,start_date,first_salary,last_salary
Senthil,Finance,3500.0,2023-09-12,3500,7600
Syamala,Finance,7600.0,2023-04-16,3500,7600
Pandya,IT,3000.0,2023-01-01,3000,8000
Parthiv,IT,4900.0,2023-10-13,3000,8000
Ritesh,IT,5100.0,2023-05-10,3000,8000
Prakash,IT,8000.0,2023-03-15,3000,8000
Prabhav,Marketing,,2023-12-20,4000,4000
Robert,Marketing,4000.0,2023-06-25,4000,4000
Prabhav,Marketing,4000.0,2023-11-19,4000,4000
Harsha,Sales,,2023-08-11,2000,5100


In [0]:
# first()/last() depend on row order, and a window built from partitionBy()
# alone does not define one, so the result could change between runs.
# Order each department by salary (ascending, nulls first), break salary ties
# by start_date, and make the frame span the whole partition.
dept_by_salary_date = (
    Window.partitionBy("department")
    .orderBy("salary", "start_date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# first_start_date / last_start_date are the start dates of the first and last
# rows in that ordering (not necessarily the earliest and latest dates).
df_window = (
    df.withColumn("first_start_date", first("start_date").over(dept_by_salary_date))
    .withColumn("last_start_date", last("start_date").over(dept_by_salary_date))
    .orderBy("department")
)
display(df_window)

employee_name,department,salary,start_date,first_start_date,last_start_date
Senthil,Finance,3500.0,2023-09-12,2023-09-12,2023-04-16
Syamala,Finance,7600.0,2023-04-16,2023-09-12,2023-04-16
Pandya,IT,3000.0,2023-01-01,2023-01-01,2023-03-15
Parthiv,IT,4900.0,2023-10-13,2023-01-01,2023-03-15
Ritesh,IT,5100.0,2023-05-10,2023-01-01,2023-03-15
Prakash,IT,8000.0,2023-03-15,2023-01-01,2023-03-15
Prabhav,Marketing,,2023-12-20,2023-12-20,2023-11-19
Robert,Marketing,4000.0,2023-06-25,2023-12-20,2023-11-19
Prabhav,Marketing,4000.0,2023-11-19,2023-12-20,2023-11-19
Harsha,Sales,,2023-08-11,2023-08-11,2024-10-04


#### **Spark SQL**

In [0]:
# Register the DataFrame as a temporary view named "transaction" so that it
# can be queried with Spark SQL in the cells below.
df.createOrReplaceTempView("transaction")

In [0]:
# Run FIRST()/LAST() as plain SQL aggregates over the "transaction" view.
# There is no GROUP BY, so the query produces a single summary row.
spark_sql = spark.sql("""SELECT FIRST(salary) AS First_Salary, LAST(salary) AS Last_Salary,
                                FIRST(start_date) AS First_start_date, LAST(start_date) AS Last_start_date
                         FROM transaction
                      """)

display(spark_sql)

First_Salary,Last_Salary,First_start_date,Last_start_date
,8000,2023-08-11,2023-03-15


In [0]:
%sql
-- FIRST()/LAST() depend on row order, and PARTITION BY alone does not define
-- one. The named window orders each department by salary (ascending, nulls
-- first), breaks ties by start_date, and spans the whole partition so the
-- result is the same on every run.
SELECT DISTINCT department,
       FIRST(salary) OVER dept_w AS first_salary,
       LAST(salary) OVER dept_w AS last_salary,
       FIRST(start_date) OVER dept_w AS first_start_date,
       LAST(start_date) OVER dept_w AS last_start_date
FROM transaction
WINDOW dept_w AS (
    PARTITION BY department
    ORDER BY salary, start_date
    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
)
ORDER BY department;

department,first_salary,last_salary,first_start_date,last_start_date
Finance,3500.0,7600,2023-09-12,2023-04-16
IT,3000.0,8000,2023-01-01,2023-03-15
Marketing,,4000,2023-12-20,2023-11-19
Sales,,5100,2023-08-11,2024-10-04
