In [33]:
from pyspark.sql import SparkSession
from datetime import date, datetime
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import typing as T
from pyspark.sql import Column, DataFrame

# Obtain a Spark session through the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()

# Four-row sample frame. Rows 3 and 4 share the same `c` value and rows 2 and 4
# share the same `d` value, so partitions on those columns hold more than one row.
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
    (4, 5., 'string3', date(2000, 2, 1), datetime(2000, 1, 3, 12, 0)),
], schema='a long, b double, c string, d date, e timestamp')
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [34]:
# Nested window aggregation: sum `a` within each `c` partition, then sum those
# per-row results within each `d` partition, exposed as `sum_a`.
df.select(
    "c",
    "d",
    F.sum(
        F.sum(F.col("a")).over(Window.partitionBy("c"))
    ).over(Window.partitionBy("d")).alias("sum_a"),
).toPandas()

Unnamed: 0,c,d,sum_a
0,string1,2000-01-01,1
1,string2,2000-02-01,9
2,string3,2000-02-01,9
3,string3,2000-03-01,7


In [4]:
# Inspect the `_df` attribute of the object returned by groupBy(); the displayed
# result has the same schema as `df`.
# NOTE(review): `_df` is underscore-prefixed, so presumably not a public API — confirm before relying on it.
df.groupBy("a")._df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [None]:
import re

# `w`: partition by "toto", order by "titi", running frame from the first row to the current row.
w = Window.orderBy("titi").rowsBetween(Window.unboundedPreceding, Window.currentRow).partitionBy("toto")
# `w2`: partition by "tutu" only.
w2 = Window.partitionBy("tutu")

# NOTE(review): the value of this expression is neither assigned nor the last
# expression of the cell, so it is not displayed — confirm whether it is still needed.
str(F.row_number().over(w).name())

# Window function nested in another window function, then taken modulo 10.
col = F.sum(F.row_number().over(w)).over(w2) % F.lit(10) # F.sum(F.row_number().over(w)).over(w)
# Textual form of the underlying JVM column; the cells below parse this string.
col_expr = str(col._jc)
print(col_expr)
# F.lit("a")


%(sum(row_number() OVER (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)) OVER (PARTITION BY tutu), 10)


In [27]:
# Chain two .over() calls and show the textual form: the displayed string
# contains two consecutive OVER clauses.
str(F.row_number().over(w).over(Window.orderBy("azerty").partitionBy("tutu"))._jc)

'row_number() OVER (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) OVER (PARTITION BY tutu ORDER BY azerty ASC NULLS FIRST)'

In [53]:
# Locate every " OVER (" occurrence in the expression text (case-insensitive).
list(re.finditer(r' OVER \(', col_expr, re.IGNORECASE))

[<re.Match object; span=(18, 25), match=' OVER ('>,
 <re.Match object; span=(123, 130), match=' OVER ('>]

In [None]:
import itertools
# NOTE(review): `l` is not assigned at notebook level in any visible cell and this
# cell has no execution count — confirm which iterable was meant before running.
list(itertools.accumulate(l))

In [83]:
# 24 and 123 are hard-coded from the match spans displayed for the " OVER (" search:
# 24 is the index of the "(" ending the first match, 123 is where the second match starts.
col_expr[24:123]

'(PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))'

In [20]:
def find_closing_parenthesis_idx(s: str) -> int:
    """Return the index of the parenthesis closing the one that opens ``s``.

    Args:
        s: Text expected to start with ``'('``.

    Returns:
        The index in ``s`` of the ``')'`` that balances the leading ``'('``,
        or ``-1`` when ``s`` is empty, does not start with ``'('``, or the
        leading parenthesis is never closed.

    Note:
        Parentheses are counted character by character; parentheses inside
        quoted literals are not treated specially.
    """
    # startswith() is safe on the empty string, whereas indexing s[0] raises IndexError.
    if not s.startswith('('):
        return -1
    depth = 0
    for idx, char in enumerate(s):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            # Depth returns to zero exactly when the leading "(" is balanced.
            if depth == 0:
                return idx
    return -1


# Type alias for arguments that accept either a Column object or a string.
ColumnOrName = T.Union[Column, str]

def find_partitions_expr(col: ColumnOrName) -> T.List[str]:
    """Extract every `` OVER (...)`` clause from a column expression.

    Args:
        col: A Column, whose JVM column is converted with ``str()``, or the
            expression text itself.

    Returns:
        One string per `` OVER `` occurrence, in order of appearance, each
        running from the leading space of `` OVER `` up to and including the
        closing parenthesis reported by ``find_closing_parenthesis_idx``.
        Nothing is returned explicitly (i.e. ``None``) for any other input type.
    """
    if isinstance(col, Column):
        # Work on the textual form of the underlying JVM column.
        return find_partitions_expr(str(col._jc))
    if isinstance(col, str):
        over_matches = list(re.finditer(r' OVER ', col))
        # Each clause is searched only up to the start of the next " OVER ";
        # the last one is searched to the end of the string.
        search_stops = [nxt.start() for nxt in over_matches[1:]] + [None]
        clauses = []
        for current, stop in zip(over_matches, search_stops):
            closing = find_closing_parenthesis_idx(col[current.end():stop])
            clauses.append(col[current.start():current.end() + closing + 1])
        return clauses


def row_ids_column(df: DataFrame) -> str:
    """Build the row-ids column name associated with ``df``.

    The name embeds ``id(df)``, so it is tied to this particular Python object.
    """
    object_id = id(df)
    return "__row_ids_" + str(object_id)


import functools

def get_aggregation_column(row_ids_col: ColumnOrName) -> Column:
    """Build the aggregation that merges row-id arrays into one de-duplicated array.

    The column is collected into a list, the list is flattened, and duplicates
    are removed.
    (presumably ``row_ids_col`` holds arrays, since the collected list is flattened — confirm with callers)
    """
    collected = F.collect_list(row_ids_col)
    flattened = F.flatten(collected)
    return F.array_distinct(flattened)


def get_aggregation_expr(row_ids_col: ColumnOrName) -> str:
    """Return the textual form of ``get_aggregation_column(row_ids_col)``.

    The text is obtained by applying ``str()`` to the column's ``_jc`` attribute.
    """
    aggregation = get_aggregation_column(row_ids_col)
    return str(aggregation._jc)

def aggregate_row_ids(col: ColumnOrName, row_ids_col: str):
    """Build a row-ids aggregation nested through the same window clauses as ``col``.

    Starting from the bare ``row_ids_col`` name, each clause returned by
    ``find_partitions_expr(col)`` wraps the expression built so far in the text
    from ``get_aggregation_expr`` and re-appends that clause, so the first
    clause ends up innermost. The final text is parsed with ``F.expr`` and
    aliased back to ``row_ids_col``. When ``col`` has no window clause, the
    result is simply ``F.expr(row_ids_col).alias(row_ids_col)``.
    """
    nested_expr = row_ids_col
    for window_clause in find_partitions_expr(col):
        nested_expr = f"({get_aggregation_expr(nested_expr)}){window_clause}"
    return F.expr(nested_expr).alias(row_ids_col)

from pprint import pprint

# Show the OVER clauses extracted from the expression text built earlier.
pprint(find_partitions_expr(col_expr))

# Show the nested aggregation built for a row-ids column named "toto".
print(aggregate_row_ids(col_expr, "toto"))

[' OVER (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN '
 'UNBOUNDED PRECEDING AND CURRENT ROW)',
 ' OVER (PARTITION BY tutu)']
Column<'(array_distinct(flatten(collect_list((array_distinct(flatten(collect_list(toto)))) OVER (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW))))) OVER (PARTITION BY tutu) AS toto'>


In [9]:
# Check how a bare name passed to F.expr is rendered.
F.expr("toto")

Column<'toto'>

In [97]:
# Take the textual form of the aggregation, append a window clause as plain text,
# and parse the combined string back into a Column with F.expr.
F.expr(str(F.array_distinct(
            F.flatten(
                F.collect_list("toto")
            )
        )._jc) + ' OVER (PARTITION BY tutu)').alias("toto")

Column<'array_distinct(flatten(collect_list(toto))) OVER (PARTITION BY tutu) AS toto'>

In [61]:
# Last whole-word "OVER" occurrence in the expression text (case-insensitive).
list(re.finditer(r'\bOVER\b', col_expr, re.IGNORECASE))[-1]

<re.Match object; span=(122, 126), match='OVER'>

In [54]:
# Try replacing the word directly before " OVER (...)". The displayed result is
# unchanged: here " OVER" is preceded by ")", not by a word character.
re.sub(
    r"\b(\w+)\b(?=\sOVER\s\(.+\))",
    "a", 
    "sum(toto) OVER (PARTITION BY titi ORDER BY titi)"
)

'sum(toto) OVER (PARTITION BY titi ORDER BY titi)'

In [46]:
# Split the expression text on the literal "OVER" (case-sensitive).
col_expr.split("OVER")

['sum(row_number() ',
 ' (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)) ',
 ' (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)']

In [39]:
# Textual form of a windowed column, obtained via the JVM column's toString().
F.row_number().over(Window.orderBy("toto"))._jc.toString()

'row_number() OVER (ORDER BY toto ASC NULLS FIRST)'

In [38]:
# Textual form of the aggregation once wrapped in a cast.
F.array_distinct(F.flatten(F.collect_list("toto"))).cast("integer")._jc.toString()

'CAST(array_distinct(flatten(collect_list(toto))) AS INT)'

In [34]:
# 1. Build a small frame with a grouping key (`col2`) and a list-of-strings column (`col1`).
data = [
    ("A", ["apple", "banana"]),
    ("B", ["orange", "grape"]),
    ("A", ["apple", "cherry"]),
    ("C", ["kiwi", "melon"]),
    ("B", ["grape", "orange"])
]
# NOTE(review): this rebinds the name `df`, which other cells also use.
df = spark.createDataFrame(data, ["col2", "col1"])

# 2. Group by 'col2' and apply the aggregation
#    - `flatten` merges the sub-arrays into a single array.
#    - `array_distinct` removes duplicates.
result_df = df.groupBy("col2").agg(
    F.array_distinct(
        F.flatten(
            F.collect_list("col1")
        )
    ).alias("merged_col1")
)
result_df.toPandas()

Unnamed: 0,col2,merged_col1
0,A,"[apple, banana, cherry]"
1,B,"[orange, grape]"
2,C,"[kiwi, melon]"


In [35]:
# Same aggregation without groupBy: the displayed result is a single row
# merging the lists from every row of `df`.
df.select(F.array_distinct(
        F.flatten(
            F.collect_list("col1")
        )
    ).alias("merged_col1")).toPandas()

Unnamed: 0,merged_col1
0,"[apple, banana, orange, grape, cherry, kiwi, m..."


In [20]:
# Sample frame with a key column (`col1`) and a numeric column (`col2`).
data = [
    ("A", 2),
    ("B", 3),
    ("A", 4),
    ("C", 5),
    ("B", 6)
]
# NOTE(review): this rebinds the name `df`, which other cells also use.
df = spark.createDataFrame(data, ["col1", "col2"])

# Windowed sum: every input row is kept and carries the total of its `col1` partition.
result_df = df.select(
    "col1",
    F.sum("col2").over(Window.partitionBy("col1")).alias("col2")
)
result_df.toPandas()

Unnamed: 0,col1,col2
0,A,6
1,A,6
2,B,9
3,B,9
4,C,5


In [32]:
# Plain aggregate without a window: the displayed result is a single row.
df.select(F.sum("col2")).toPandas()

Unnamed: 0,sum(col2)
0,20


In [101]:
import re

def replace_word_before_over(original_string: str, replacement: str) -> str:
    """
    Replaces each word that directly precedes an "OVER (...)" clause.

    Every run of word characters immediately followed by one whitespace
    character, the upper-case keyword OVER, one whitespace character and a
    parenthesised group is swapped for `replacement`. Text where OVER is
    preceded by a non-word character (for example "sum(toto) OVER (...)")
    is left unchanged.

    Args:
        original_string (str): The string to modify.
        replacement (str): The text inserted verbatim in place of each matched word.

    Returns:
        str: The new string with the matched word(s) replaced.
    """
    # The regex pattern:
    # \b(\w+)\b : Matches and captures a word (group 1)
    # (?=\sOVER\s\(.+\)) : Positive lookahead to ensure it's followed by " OVER (some_text)"
    pattern = r"\b(\w+)\b(?=\sOVER\s\(.+\))"

    # A callable keeps `replacement` literal. Passed as a plain string, re.sub
    # would expand backslash escapes and group references such as \1 inside it.
    return re.sub(pattern, lambda _match: replacement, original_string)

# --- Examples ---
# Each example prints the input next to the result of replace_word_before_over.

# Example 1: Original problem
text1 = "toto OVER (titi)"
new_text1 = replace_word_before_over(text1, "new_word")
print(f"Original: '{text1}' -> New: '{new_text1}'")

# Example 2: Different words
text2 = "amount_column OVER (partition_col)"
new_text2 = replace_word_before_over(text2, "sum_of_amount")
print(f"Original: '{text2}' -> New: '{new_text2}'")

# Example 3: Edge case - the word appears elsewhere
# Only the occurrence directly followed by " OVER (" is replaced.
text3 = "my_column + 1, my_column OVER (some_other_column)"
new_text3 = replace_word_before_over(text3, "sum_of_my_column")
print(f"Original: '{text3}' -> New: '{new_text3}'")

Original: 'toto OVER (titi)' -> New: 'new_word OVER (titi)'
Original: 'amount_column OVER (partition_col)' -> New: 'sum_of_amount OVER (partition_col)'
Original: 'my_column + 1, my_column OVER (some_other_column)' -> New: 'my_column + 1, sum_of_my_column OVER (some_other_column)'
