In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    BooleanType,
)

In [2]:
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/20 09:24:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.createDataFrame(
    [
        (348272371, "2023-01-01"),
        (348272371, "2023-01-02"),
        (348272371, "2023-01-03"),
        (348272371, "2023-01-04"),
        (348272371, "2023-01-05"),
        (348272371, "2023-01-06"),
        (348272371, "2023-01-07"),
    ],
    schema=[
        "ID_BIC_CLIENTE",
        "DATA_TRANSAZIONE",
    ],
)

df = df.withColumn(
    "DATA_TRANSAZIONE", F.to_timestamp(F.col("DATA_TRANSAZIONE"), "yyyy-MM-dd")
)

In [4]:
df.show()

                                                                                

+--------------+-------------------+
|ID_BIC_CLIENTE|   DATA_TRANSAZIONE|
+--------------+-------------------+
|     348272371|2023-01-01 00:00:00|
|     348272371|2023-01-02 00:00:00|
|     348272371|2023-01-03 00:00:00|
|     348272371|2023-01-04 00:00:00|
|     348272371|2023-01-05 00:00:00|
|     348272371|2023-01-06 00:00:00|
|     348272371|2023-01-07 00:00:00|
+--------------+-------------------+



In [5]:
for row in df.select("DATA_TRANSAZIONE").collect():
    print(type(row.DATA_TRANSAZIONE))

<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>
<class 'datetime.datetime'>


In [6]:
datetime_rows = df.select("DATA_TRANSAZIONE").collect()
datetimes = [datetime_row.DATA_TRANSAZIONE for datetime_row in datetime_rows]
datetimes

[datetime.datetime(2023, 1, 1, 0, 0),
 datetime.datetime(2023, 1, 2, 0, 0),
 datetime.datetime(2023, 1, 3, 0, 0),
 datetime.datetime(2023, 1, 4, 0, 0),
 datetime.datetime(2023, 1, 5, 0, 0),
 datetime.datetime(2023, 1, 6, 0, 0),
 datetime.datetime(2023, 1, 7, 0, 0)]

In [7]:
datetime_couples = [(datetime_1, datetime_2) for datetime_1, datetime_2 in zip(datetimes[::], datetimes[1::])]
datetime_couples

[(datetime.datetime(2023, 1, 1, 0, 0), datetime.datetime(2023, 1, 2, 0, 0)),
 (datetime.datetime(2023, 1, 2, 0, 0), datetime.datetime(2023, 1, 3, 0, 0)),
 (datetime.datetime(2023, 1, 3, 0, 0), datetime.datetime(2023, 1, 4, 0, 0)),
 (datetime.datetime(2023, 1, 4, 0, 0), datetime.datetime(2023, 1, 5, 0, 0)),
 (datetime.datetime(2023, 1, 5, 0, 0), datetime.datetime(2023, 1, 6, 0, 0)),
 (datetime.datetime(2023, 1, 6, 0, 0), datetime.datetime(2023, 1, 7, 0, 0))]

In [8]:
for datetime_couple in datetime_couples:
    print(datetime_couple[1] - datetime_couple[0])

1 day, 0:00:00
1 day, 0:00:00
1 day, 0:00:00
1 day, 0:00:00
1 day, 0:00:00
1 day, 0:00:00


In [9]:
def engine(pdf):
    print(pdf.len())

df.groupby("ID_BIC_CLIENTE").applyInPandas(engine, schema="ID_BIC_CLIENTE long, DATA_TRANSAZIONE date")

DataFrame[ID_BIC_CLIENTE: bigint, DATA_TRANSAZIONE: date]

23/07/20 09:24:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [25]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Creazione della SparkSession
spark = SparkSession.builder \
    .appName("CheckDateIntervals") \
    .getOrCreate()

# Creazione del DataFrame con dati casuali
data = [
    (1, "2023-07-20 12:00:00"),
    (1, "2023-07-22 12:00:00"),
    (1, "2023-07-21 12:00:00"),
    (2, "2023-07-22 12:00:00"),
    (2, "2023-07-21 12:00:00"),
    # Aggiungi altre righe per testare diversi scenari
]

schema = ["id", "date"]
df = spark.createDataFrame(data, schema)
df = df.withColumn("date", F.col("date").cast("timestamp"))

df = df.withColumn("timestamp_unix", F.unix_timestamp('date'))

# Calcolo della differenza tra le date per ogni "id"
window_spec = Window.partitionBy("id").orderBy("date")

df = df.withColumn("date_diff", F.col("timestamp_unix") - F.lag(F.col("timestamp_unix"), 1).over(window_spec))

df.show()

# Verifica della distanza di 24 ore tra le date
invalid_intervals = df.filter((F.col("date_diff") != 24*60*60) & F.col("date_diff").isNotNull())

# Stampa dei risultati
if invalid_intervals.count() > 0:
    print("Ci sono intervalli non validi per le date.")
    invalid_intervals.show()
else:
    print("Tutti gli intervalli di date sono validi (24 ore di differenza).")

+---+-------------------+----------+---------+
| id|               date|      unix|date_diff|
+---+-------------------+----------+---------+
|  1|2023-07-20 12:00:00|1689847200|     null|
|  1|2023-07-21 12:00:00|1689933600|    86400|
|  1|2023-07-22 12:00:00|1690020000|    86400|
|  2|2023-07-21 12:00:00|1689933600|     null|
|  2|2023-07-22 12:00:00|1690020000|    86400|
+---+-------------------+----------+---------+

Tutti gli intervalli di date sono validi (24 ore di differenza).


In [29]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Creazione della SparkSession
spark = SparkSession.builder \
    .appName("CheckDateIntervals") \
    .getOrCreate()

# Creazione del DataFrame con dati casuali
data = [
    (1, "2023-03-25"),
    (1, "2023-03-26"),
    (1, "2023-03-27"),
    (2, "2023-03-26"),
    (2, "2023-03-27"),
    # Aggiungi altre righe per testare diversi scenari
]

schema = ["id", "date"]
df = spark.createDataFrame(data, schema)
df = df.withColumn("date", F.col("date").cast("timestamp"))

df = df.withColumn("timestamp_unix", F.unix_timestamp('date'))

# Calcolo della differenza tra le date per ogni "id"
window_spec = Window.partitionBy("id").orderBy("date")

df = df.withColumn("date_diff", F.col("timestamp_unix") - F.lag(F.col("timestamp_unix"), 1).over(window_spec))

df.show()

# Verifica della distanza di 24 ore tra le date
invalid_intervals = df.filter((F.col("date_diff") != 24*60*60) & F.col("date_diff").isNotNull())

# Stampa dei risultati
if invalid_intervals.count() > 0:
    print("Ci sono intervalli non validi per le date.")
    invalid_intervals.show()
else:
    print("Tutti gli intervalli di date sono validi (24 ore di differenza).")

+---+-------------------+--------------+---------+
| id|               date|timestamp_unix|date_diff|
+---+-------------------+--------------+---------+
|  1|2023-03-25 00:00:00|    1679698800|     null|
|  1|2023-03-26 00:00:00|    1679785200|    86400|
|  1|2023-03-27 00:00:00|    1679868000|    82800|
|  2|2023-03-26 00:00:00|    1679785200|     null|
|  2|2023-03-27 00:00:00|    1679868000|    82800|
+---+-------------------+--------------+---------+

Ci sono intervalli non validi per le date.
+---+-------------------+--------------+---------+
| id|               date|timestamp_unix|date_diff|
+---+-------------------+--------------+---------+
|  1|2023-03-27 00:00:00|    1679868000|    82800|
|  2|2023-03-27 00:00:00|    1679868000|    82800|
+---+-------------------+--------------+---------+

