In [None]:
from pyspark.sql import SparkSession
from datetime import date, datetime


# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Three sample rows, one value per column type declared in the schema string
# (long, double, string, date, timestamp).
df = spark.createDataFrame(
    [
        (1, 2.0, "string1", date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3.0, "string2", date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 4.0, "string3", date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
    ],
    schema="a long, b double, c string, d date, e timestamp",
)

# Last expression of the cell: shows the DataFrame repr (column names and types).
df

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/18 21:47:56 WARN Utils: Your hostname, FRL-1SFXW94, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/18 21:47:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/18 21:47:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/18 21:47:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [3]:
# Convert the Spark DataFrame to pandas so the notebook renders it as a table.
df.toPandas()

                                                                                

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [10]:
# Peek at the DataFrame held by the grouped object that groupBy returns.
# NOTE(review): `_df` is an underscore-prefixed (internal) attribute — fine for
# exploration, but presumably not a stable API; confirm before relying on it.
df.groupBy("a")._df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [103]:
# `F` is imported here so this cell runs on a fresh kernel in top-to-bottom
# order instead of relying on an import made by a later cell.
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Window specification partitioned by a column named "toto". Only the
# expression is built here; it is not evaluated against any DataFrame.
w = Window.partitionBy("toto")

# Show the text form of the windowed row_number() expression.
# NOTE(review): `_jc` is an underscore-prefixed (internal) attribute of Column;
# presumably not available on every backend — confirm before relying on it.
F.row_number().over(w)._jc.toString()

'row_number() OVER (PARTITION BY toto)'

In [24]:
import pyspark.sql.functions as F

# Sample rows: a group key followed by an array of strings.
data = [
    ("A", ["apple", "banana"]),
    ("B", ["orange", "grape"]),
    ("A", ["apple", "cherry"]),
    ("C", ["kiwi", "melon"]),
    ("B", ["grape", "orange"]),
]
df = spark.createDataFrame(data, ["col2", "col1"])

# For each `col2` group:
#   collect_list   -> gathers every `col1` array of the group (array of arrays)
#   flatten        -> merges the sub-arrays into a single array
#   array_distinct -> removes duplicate elements
result_df = (
    df.groupBy("col2")
    .agg(F.array_distinct(F.flatten(F.collect_list("col1"))).alias("merged_col1"))
)
result_df.toPandas()

Unnamed: 0,col2,merged_col1
0,A,"[apple, banana, cherry]"
1,B,"[orange, grape]"
2,C,"[kiwi, melon]"


In [None]:
# Same per-`col2` aggregation as the previous cell: collect the `col1` arrays,
# flatten them into one array and drop duplicate elements.
# NOTE(review): the exception recorded under this cell mentions
# `array_distinct(col1)` with no collect_list/flatten, so it appears to come
# from an earlier version of this code — re-run the cell to refresh the output.
result_df = (
    df.groupBy("col2")
    .agg(F.array_distinct(F.flatten(F.collect_list("col1"))).alias("merged_col1"))
)
result_df.toPandas()

AnalysisException: [MISSING_AGGREGATION] The non-aggregating expression "col1" is based on columns which are not participating in the GROUP BY clause.
Add the columns or the expression to the GROUP BY, aggregate the expression, or use "any_value(col1)" if you do not care which of the values within a group is returned. SQLSTATE: 42803;
Aggregate [col2#11], [col2#11, array_distinct(col1#12) AS merged_col1#48]
+- LogicalRDD [col2#11, col1#12], false


In [101]:
import re

def replace_word_before_over(original_string, replacement):
    """
    Replace the word that directly precedes an "OVER (...)" clause.

    A word (a run of letters, digits and underscores) is replaced when it is
    immediately followed by one whitespace character, the uppercase keyword
    OVER, one more whitespace character and a non-empty parenthesised text.
    Every such word in the string is replaced; the "OVER (...)" clause itself
    and all other text are left untouched.

    Args:
        original_string (str): The string to modify, e.g. "toto OVER (titi)".
        replacement (str): The text to put in place of the matched word. It is
            inserted literally: backslashes and group references inside it are
            not interpreted.

    Returns:
        str: The new string with the word replaced. When nothing matches, the
        result equals the input.
    """
    # The regex pattern:
    # \b(\w+)\b : Matches and captures a word (group 1)
    # (?=\sOVER\s\(.+\)) : Positive lookahead to ensure it's followed by " OVER (some_text)"
    #                      (a lookahead consumes nothing, so only the word is replaced)
    pattern = r"\b(\w+)\b(?=\sOVER\s\(.+\))"

    # A callable is passed instead of the raw string: re.sub treats a string
    # replacement as a template, so text such as r"\1" would re-insert the
    # matched word and an unknown escape such as r"\d" would raise re.error.
    return re.sub(pattern, lambda _match: replacement, original_string)

# --- Examples ---

# Inputs
text1 = "toto OVER (titi)"                                   # Example 1: original problem
text2 = "amount_column OVER (partition_col)"                 # Example 2: different words
text3 = "my_column + 1, my_column OVER (some_other_column)"  # Example 3: the word also appears elsewhere

# Only the word right before " OVER (" is expected to change in each case.
new_text1 = replace_word_before_over(text1, "new_word")
new_text2 = replace_word_before_over(text2, "sum_of_amount")
new_text3 = replace_word_before_over(text3, "sum_of_my_column")

# Before/after pairs, printed in example order.
print(f"Original: '{text1}' -> New: '{new_text1}'")
print(f"Original: '{text2}' -> New: '{new_text2}'")
print(f"Original: '{text3}' -> New: '{new_text3}'")

Original: 'toto OVER (titi)' -> New: 'new_word OVER (titi)'
Original: 'amount_column OVER (partition_col)' -> New: 'sum_of_amount OVER (partition_col)'
Original: 'my_column + 1, my_column OVER (some_other_column)' -> New: 'my_column + 1, sum_of_my_column OVER (some_other_column)'
