In [1]:
from pyspark.sql import SparkSession
from datetime import date, datetime
import pyspark.sql.functions as F

# Obtain a SparkSession through the builder (getOrCreate returns an existing
# session when one is available, otherwise starts one).
spark = SparkSession.builder.getOrCreate()

# Three-row demo frame. The DDL-style schema string names the five columns and
# fixes their types: long, double, string, date and timestamp.
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
# Bare last expression: the cell shows the DataFrame repr (column names and types).
df

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/19 13:37:16 WARN Utils: Your hostname, FRL-1SFXW94, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/19 13:37:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/19 13:37:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [2]:
# Convert the Spark DataFrame to a pandas DataFrame so the rows render as a table.
df.toPandas()

                                                                                

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [3]:
# Inspect the `_df` attribute of the grouped object; its repr lists the same
# columns as `df`.
# NOTE(review): the leading underscore marks `_df` as internal, so presumably it
# is not a stable API — confirm before relying on it outside exploration.
df.groupBy("a")._df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [53]:
from pyspark.sql.window import Window
import re

# Running window: rows are partitioned by `toto`, ordered by `titi`, and the
# frame runs from the first row of the partition up to the current row.
w = (
    Window.partitionBy("toto")
    .orderBy("titi")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Render a nested window expression as text: `_jc.toString()` yields the
# SQL-like string that the regex experiments in this notebook operate on.
# NOTE(review): `_jc` is a private attribute — confirm it is available on the
# PySpark version in use.
col_expr = F.sum(F.row_number().over(w)).over(w)._jc.toString()
print(col_expr)


sum(row_number() OVER (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)) OVER (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)


In [None]:
def toto(s: str) -> str:
    """Return the trailing window clause of a rendered column expression.

    Args:
        s: Expression text such as ``"sum(x) OVER (PARTITION BY a)"``.

    Returns:
        When ``s`` contains the uppercase substring ``OVER`` and at least one
        whole-word ``OVER`` (the word itself is matched case-insensitively),
        the substring of ``s`` that starts at the last such word. Otherwise
        the placeholder ``"tutu()"``.
    """
    # Case-sensitive gate: text without an uppercase "OVER" gets the placeholder.
    if "OVER" in s:
        # The word boundaries must hug the keyword. A pattern written as
        # r'\b OVER \b' can never match "row_number() OVER (...)": there is no
        # boundary between ")" and " ", nor between " " and "(".
        over_matches = list(re.finditer(r'\bOVER\b', s, re.IGNORECASE))
        # "OVER" may occur only inside a longer word (e.g. "DISCOVERY"); then
        # there is no keyword to slice from and the placeholder is returned.
        if over_matches:
            return s[over_matches[-1].start():]
    return "tutu()"

In [61]:
# Locate the last whole-word OVER in the rendered expression; its span marks
# where the final (outer) window clause of `col_expr` begins.
list(re.finditer(r'\bOVER\b', col_expr, re.IGNORECASE))[-1]

<re.Match object; span=(122, 126), match='OVER'>

In [54]:
# Attempt to replace the word sitting directly before " OVER (...)".
# The result shown below is unchanged: in this input ")" — not a word
# character — precedes " OVER", so no `\w+` run is immediately followed by the
# lookahead text and the pattern finds nothing to substitute.
re.sub(
    r"\b(\w+)\b(?=\sOVER\s\(.+\))",
    "a", 
    "sum(toto) OVER (PARTITION BY titi ORDER BY titi)"
)

'sum(toto) OVER (PARTITION BY titi ORDER BY titi)'

In [46]:
# Naive alternative: split on the literal "OVER". The nested expression in
# `col_expr` contains two of them, so the split yields three pieces.
col_expr.split("OVER")

['sum(row_number() ',
 ' (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)) ',
 ' (PARTITION BY toto ORDER BY titi ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)']

In [39]:
# Rendered text of a single window expression ordered by `toto`, for reference.
F.row_number().over(Window.orderBy("toto"))._jc.toString()

'row_number() OVER (ORDER BY toto ASC NULLS FIRST)'

In [38]:
# Rendered text of a nested expression with no window: the cast wraps the whole
# call chain as CAST(... AS INT).
F.array_distinct(F.flatten(F.collect_list("toto"))).cast("integer")._jc.toString()

'CAST(array_distinct(flatten(collect_list(toto))) AS INT)'

In [34]:
# Each row pairs a group key (col2) with an array of strings (col1).
data = [
    ("A", ["apple", "banana"]),
    ("B", ["orange", "grape"]),
    ("A", ["apple", "cherry"]),
    ("C", ["kiwi", "melon"]),
    ("B", ["grape", "orange"]),
]
df = spark.createDataFrame(data, schema=["col2", "col1"])

# For every col2 group: collect all col1 arrays, flatten them into a single
# array, then drop duplicate elements.
result_df = df.groupBy("col2").agg(
    F.array_distinct(F.flatten(F.collect_list("col1"))).alias("merged_col1")
)
result_df.toPandas()

Unnamed: 0,col2,merged_col1
0,A,"[apple, banana, cherry]"
1,B,"[orange, grape]"
2,C,"[kiwi, melon]"


In [35]:
# Same merge without a groupBy: the aggregate runs over the whole frame and the
# result is a single row.
(
    df
    .select(
        F.array_distinct(F.flatten(F.collect_list("col1"))).alias("merged_col1")
    )
    .toPandas()
)

Unnamed: 0,merged_col1
0,"[apple, banana, orange, grape, cherry, kiwi, m..."


In [20]:
# Key/value rows: col1 is the group key, col2 the number to total.
data = [("A", 2), ("B", 3), ("A", 4), ("C", 5), ("B", 6)]
df = spark.createDataFrame(data, schema=["col1", "col2"])

# Window sum: each input row is kept and carries the col2 total of its col1
# partition (contrast with a groupBy, which would return one row per key).
result_df = df.select(
    "col1",
    F.sum("col2").over(Window.partitionBy("col1")).alias("col2"),
)
result_df.toPandas()

Unnamed: 0,col1,col2
0,A,6
1,A,6
2,B,9
3,B,9
4,C,5


In [32]:
# Plain aggregate with no window: the frame collapses to one row holding the total.
df.select(F.sum("col2")).toPandas()

Unnamed: 0,sum(col2)
0,20


In [101]:
import re

def replace_word_before_over(original_string, replacement):
    """
    Substitute the word that directly precedes an ``OVER (...)`` clause.

    A word qualifies when it is immediately followed by one whitespace
    character, the uppercase keyword ``OVER``, one more whitespace character
    and a parenthesised, non-empty clause on the same line. Every qualifying
    word in the string is substituted; text such as ``"sum(x) OVER (...)"``,
    where ``)`` rather than a word sits before ``OVER``, is returned unchanged.

    Args:
        original_string (str): The text to rewrite.
        replacement (str): Substitution handed to the regex engine as the
            replacement template.

    Returns:
        str: The rewritten text.
    """
    # The captured word is what gets consumed and replaced; the lookahead only
    # checks that " OVER (...)" follows, so that part of the text is kept.
    word_before_over = re.compile(r"\b(\w+)\b(?=\sOVER\s\(.+\))")
    return word_before_over.sub(replacement, original_string)

# --- Examples ---

# Example 1: the original problem.
text1 = "toto OVER (titi)"
# Example 2: different words.
text2 = "amount_column OVER (partition_col)"
# Example 3: edge case — the word also appears earlier, away from OVER.
text3 = "my_column + 1, my_column OVER (some_other_column)"

new_text1 = replace_word_before_over(text1, "new_word")
new_text2 = replace_word_before_over(text2, "sum_of_amount")
new_text3 = replace_word_before_over(text3, "sum_of_my_column")

# One line per example; the generator's loop variables stay out of the
# notebook namespace.
print("\n".join(
    f"Original: '{before}' -> New: '{after}'"
    for before, after in (
        (text1, new_text1),
        (text2, new_text2),
        (text3, new_text3),
    )
))

Original: 'toto OVER (titi)' -> New: 'new_word OVER (titi)'
Original: 'amount_column OVER (partition_col)' -> New: 'sum_of_amount OVER (partition_col)'
Original: 'my_column + 1, my_column OVER (some_other_column)' -> New: 'my_column + 1, sum_of_my_column OVER (some_other_column)'
