In [1]:
import pathlib
from datetime import datetime
from typing import List, Tuple, Union, Dict

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Column

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Run Spark locally, restricted to 4 CPU cores
spark = SparkSession.builder.master("local[4]").getOrCreate()

# Keep the number of shuffle partitions small for this toy dataset.
# NOTE: the key is plural ("partitions"). The singular spelling is not a
# recognized Spark setting, so using it leaves the default in effect.
spark.conf.set("spark.sql.shuffle.partitions", 4)

# Interpret and render timestamps in UTC for this session
spark.conf.set("spark.sql.session.timeZone", "UTC")

23/08/25 11:27:42 WARN Utils: Your hostname, Emilianos-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.130 instead (on interface en0)
23/08/25 11:27:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/08/25 11:27:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/25 11:27:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Sample data: one dict per row, expanded from (name, age, example_date) tuples.
df = spark.createDataFrame(
    [
        {"name": name, "age": age, "example_date": example_date}
        for name, age, example_date in (
            ("John", 25, "2023-08-23"),
            ("John", 25, "2023-08-24"),
            ("John", 26, "2023-08-25"),
            ("Mary", 50, "2023-08-23"),
            ("Mary", 50, "2023-08-23"),
        )
    ]
)

In [7]:
# Print the sample DataFrame to check it was built as expected
df.show()

+---+------------+----+
|age|example_date|name|
+---+------------+----+
| 25|  2023-08-23|John|
| 25|  2023-08-24|John|
| 26|  2023-08-25|John|
| 50|  2023-08-23|Mary|
| 50|  2023-08-23|Mary|
+---+------------+----+



In [8]:
# Window covering every row that shares the same (name, age) pair
window_spec = Window.partitionBy(F.col("name"), F.col("age"))

# Attach the latest example_date found within each (name, age) group
df_with_max = df.withColumn(
    colName="max_date",
    col=F.max(F.col("example_date")).over(window_spec),
)
df_with_max.show()

+---+------------+----+----------+
|age|example_date|name|  max_date|
+---+------------+----+----------+
| 25|  2023-08-23|John|2023-08-24|
| 25|  2023-08-24|John|2023-08-24|
| 26|  2023-08-25|John|2023-08-25|
| 50|  2023-08-23|Mary|2023-08-23|
| 50|  2023-08-23|Mary|2023-08-23|
+---+------------+----+----------+



In [16]:
class DummyConfig:
    """
    Minimal dict-backed stand-in for a configuration object.

    Supports ``config[key]``, ``config.get(key, default)``, ``key in config``,
    iteration over keys, and ``len(config)``. All operations delegate to the
    wrapped dictionary, which is stored by reference (not copied).
    """

    def __init__(self, config: dict):
        """
        Args:
            config (dict): The dictionary holding the configuration values.
        """
        self.config = config

    def __getitem__(self, key):
        """Return the value stored under ``key``; raises KeyError if absent."""
        return self.config[key]

    def __contains__(self, key) -> bool:
        """Return True if ``key`` is present in the wrapped dictionary."""
        # Without this method Python falls back to calling __getitem__ with
        # 0, 1, 2, ... which raises KeyError(0) instead of answering the question.
        return key in self.config

    def __iter__(self):
        """Iterate over the configuration keys."""
        # Defined explicitly for the same reason as __contains__: the implicit
        # __getitem__-based iteration protocol would raise KeyError(0).
        return iter(self.config)

    def __len__(self) -> int:
        """Return the number of configuration entries."""
        return len(self.config)

    def get(self, key, default=None):
        """Return the value stored under ``key``, or ``default`` if absent."""
        return self.config.get(key, default)

In [20]:
def partition_by_max(
    df: DataFrame, config: Dict[str, Union[List[str], str]]
) -> DataFrame:
    """
    Adds a new column to the DataFrame with the maximum value of a specified column within each partition.

    Args:
        df (DataFrame): The input DataFrame.
        config (dict): A dictionary (or any object supporting ``config[key]``)
            containing configuration parameters.
            - partition_cols (List[str] | str): Column name(s) to partition by.
              A single column may be given as a plain string.
            - max_column (str): The column for which the maximum value will be calculated within each partition.
            - column_name (str): The name of the new column that will contain the maximum values.

    Returns:
        DataFrame: A new DataFrame with the added column containing the maximum values.

    Raises:
        KeyError: If a required key is missing from a dict-like ``config``.
    """
    partition_cols = config["partition_cols"]
    max_column = config["max_column"]
    column_name = config["column_name"]

    # A bare string would otherwise be unpacked character by character below,
    # producing a window over columns that do not exist.
    if isinstance(partition_cols, str):
        partition_cols = [partition_cols]

    window_spec = Window.partitionBy(*partition_cols)
    return df.withColumn(column_name, F.max(max_column).over(window_spec))


# Settings for the demo: group by (name, age) and expose the latest
# example_date of each group in a new "max_date" column.
config = DummyConfig(
    dict(
        partition_cols=["name", "age"],
        max_column="example_date",
        column_name="max_date",
    )
)

# Apply the windowed max through the reusable helper
output_df = partition_by_max(df=df, config=config)

In [21]:
# Print the result of partition_by_max, including the added "max_date" column
output_df.show()

+---+------------+----+----------+
|age|example_date|name|  max_date|
+---+------------+----+----------+
| 25|  2023-08-23|John|2023-08-24|
| 25|  2023-08-24|John|2023-08-24|
| 26|  2023-08-25|John|2023-08-25|
| 50|  2023-08-23|Mary|2023-08-23|
| 50|  2023-08-23|Mary|2023-08-23|
+---+------------+----+----------+

