In [1]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

In [2]:
spark = SparkSession \
    .builder \
    .appName("PySpark Experimentation") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/17 16:15:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_df = spark.createDataFrame([
    (1, "2023-06-01", 5, "option_1"),
    (1, "2023-06-01", 6, "option_2"),
    (1, "2023-06-02", 7, "option_1"),
    (1, "2023-06-06", 4, "option_1"),
    (2, "2023-06-03", 10, "option_2"),
    (2, "2023-06-03", 13, "option_2"),
], schema=['id', 'timestamp', 'amount', "categorical_feature"])

data_df = data_df.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy-MM-dd"))

In [4]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- amount: long (nullable = true)
 |-- categorical_feature: string (nullable = true)



True

In [5]:
data_df.show()

                                                                                

+---+-------------------+------+-------------------+
| id|          timestamp|amount|categorical_feature|
+---+-------------------+------+-------------------+
|  1|2023-06-01 00:00:00|     5|           option_1|
|  1|2023-06-01 00:00:00|     6|           option_2|
|  1|2023-06-02 00:00:00|     7|           option_1|
|  1|2023-06-06 00:00:00|     4|           option_1|
|  2|2023-06-03 00:00:00|    10|           option_2|
|  2|2023-06-03 00:00:00|    13|           option_2|
+---+-------------------+------+-------------------+



23/07/04 10:53:49 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [5]:
utf_shift_hours = 2
windows_size = "2 days"

In [6]:
window_column = F.window(timeColumn=F.col("timestamp"), windowDuration=windows_size, startTime=f"-{utf_shift_hours} hours")

In [7]:
data_df = data_df.withColumn("window", window_column)

In [8]:
data_df.show(truncate=False)

                                                                                

+---+-------------------+------+-------------------+------------------------------------------+
|id |timestamp          |amount|categorical_feature|window                                    |
+---+-------------------+------+-------------------+------------------------------------------+
|1  |2023-06-01 00:00:00|5     |option_1           |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-01 00:00:00|6     |option_2           |{2023-05-31 00:00:00, 2023-06-02 00:00:00}|
|1  |2023-06-02 00:00:00|7     |option_1           |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
|1  |2023-06-06 00:00:00|4     |option_1           |{2023-06-06 00:00:00, 2023-06-08 00:00:00}|
|2  |2023-06-03 00:00:00|10    |option_2           |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
|2  |2023-06-03 00:00:00|13    |option_2           |{2023-06-02 00:00:00, 2023-06-04 00:00:00}|
+---+-------------------+------+-------------------+------------------------------------------+



In [9]:
cat_grouped_df = data_df.groupBy(F.col("id"), window_column).pivot("categorical_feature").count()
cat_grouped_df.show()

23/07/17 16:15:30 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


+---+--------------------+--------+--------+
| id|              window|option_1|option_2|
+---+--------------------+--------+--------+
|  1|{2023-05-31 00:00...|       1|       1|
|  1|{2023-06-06 00:00...|       1|    null|
|  2|{2023-06-02 00:00...|    null|       2|
|  1|{2023-06-02 00:00...|       1|    null|
+---+--------------------+--------+--------+



In [10]:
num_grouped_df = data_df.groupBy(F.col("id"), window_column).agg(F.sum("amount").alias("tot"))
num_grouped_df.show()

+---+--------------------+---+
| id|              window|tot|
+---+--------------------+---+
|  1|{2023-05-31 00:00...| 11|
|  1|{2023-06-02 00:00...|  7|
|  1|{2023-06-06 00:00...|  4|
|  2|{2023-06-02 00:00...| 23|
+---+--------------------+---+



In [11]:
grouped_df = cat_grouped_df.join(num_grouped_df, on=["id", "window"])
grouped_df.show()

+---+--------------------+--------+--------+---+
| id|              window|option_1|option_2|tot|
+---+--------------------+--------+--------+---+
|  1|{2023-05-31 00:00...|       1|       1| 11|
|  1|{2023-06-02 00:00...|       1|    null|  7|
|  1|{2023-06-06 00:00...|       1|    null|  4|
|  2|{2023-06-02 00:00...|    null|       2| 23|
+---+--------------------+--------+--------+---+



In [12]:
grouped_df = grouped_df.withColumn("timestamp", F.col("window").start)
# grouped_df = grouped_df.withColumn("window_end", F.col("window").end)

grouped_df.show()

+---+--------------------+--------+--------+---+-------------------+
| id|              window|option_1|option_2|tot|          timestamp|
+---+--------------------+--------+--------+---+-------------------+
|  1|{2023-05-31 00:00...|       1|       1| 11|2023-05-31 00:00:00|
|  1|{2023-06-02 00:00...|       1|    null|  7|2023-06-02 00:00:00|
|  1|{2023-06-06 00:00...|       1|    null|  4|2023-06-06 00:00:00|
|  2|{2023-06-02 00:00...|    null|       2| 23|2023-06-02 00:00:00|
+---+--------------------+--------+--------+---+-------------------+



In [15]:
min_window_timestamp = grouped_df.select(F.min(F.col("timestamp"))).collect()[0].__getitem__('min(timestamp)')
max_window_timestamp = grouped_df.select(F.max(F.col("timestamp"))).collect()[0].__getitem__('max(timestamp)')

In [16]:
spark.sql(f"SELECT sequence(to_date('{min_window_timestamp}'), to_date('{max_window_timestamp}'), interval 2 days) as date").withColumn("date", F.explode(F.col("date"))).show()

+----------+
|      date|
+----------+
|2023-05-31|
|2023-06-02|
|2023-06-04|
|2023-06-06|
+----------+



In [17]:
timestamps_df = spark.sql(f"SELECT sequence(to_date('{min_window_timestamp}'), to_date('{max_window_timestamp}'), interval 2 days) as date").withColumn("date", F.explode(F.col("date")))
timestamps_df = timestamps_df.withColumn("timestamp", F.to_date(F.col("date"), "yyyy-MM-dd")).drop("date")

timestamps_df.show()

+----------+
| timestamp|
+----------+
|2023-05-31|
|2023-06-02|
|2023-06-04|
|2023-06-06|
+----------+



In [18]:
timestamps_df.printSchema()

root
 |-- timestamp: date (nullable = true)



In [19]:
ids_df = data_df.select(F.col("id")).distinct()
ids_timestamps_df = timestamps_df.join(ids_df, how="cross")

ids_timestamps_df.show()

+----------+---+
| timestamp| id|
+----------+---+
|2023-05-31|  1|
|2023-06-02|  1|
|2023-06-04|  1|
|2023-06-06|  1|
|2023-05-31|  2|
|2023-06-02|  2|
|2023-06-04|  2|
|2023-06-06|  2|
+----------+---+



In [20]:
group_on = (grouped_df.timestamp == ids_timestamps_df.timestamp) & (grouped_df.id == ids_timestamps_df.id)
grouped_df.join(ids_timestamps_df, on=["id", "timestamp"], how='right').fillna(0, subset=["tot", "option_1", "option_2"]).drop("window").show(truncate=False)

+---+----------+--------+--------+---+
|id |timestamp |option_1|option_2|tot|
+---+----------+--------+--------+---+
|1  |2023-05-31|1       |1       |11 |
|1  |2023-06-02|1       |0       |7  |
|1  |2023-06-04|0       |0       |0  |
|1  |2023-06-06|1       |0       |4  |
|2  |2023-05-31|0       |0       |0  |
|2  |2023-06-02|0       |2       |23 |
|2  |2023-06-04|0       |0       |0  |
|2  |2023-06-06|0       |0       |0  |
+---+----------+--------+--------+---+



In [None]:
spark.stop()

In [18]:
class StepProva():
    def process(self):
        print("ciao")

    def __call__(self):
        preprocess
        process
        posteprocess
        print("veloce")

stepProva = StepProva()
stepProva()

veloce


In [1]:
a = {
    "a": "ciao",
    "b": [1,2,3]
}

print(a)

{'a': 'ciao', 'b': [1, 2, 3]}


In [8]:
from abc import ABC, abstractmethod
class AbstractPipelineStep(ABC):

    @classmethod
    @abstractmethod
    def from_config(cls):
        pass

class Prova(AbstractPipelineStep):
    @classmethod
    def from_config(cls):
        print("ciao")

p = Prova()
p.from_config()

ciao


In [10]:
from pydantic import BaseModel
from typing import List
from enum import Enum


class Fruit(str, Enum):
    APPLE = 'apple'
    BANANA = 'banana'
    MELON = 'melon'


class UserForm(BaseModel):
    fruits: Fruit

user_form = UserForm(fruits="sapple")

ValidationError: 1 validation error for UserForm
fruits
  Input should be 'apple','banana' or 'melon' [type=enum, input_value='sapple', input_type=str]