In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('./source/etl'))

from pyspark.sql import SparkSession
from SparkDBUtils import SparkDB
import delta
import datetime as dt
import pyspark.sql
import pyspark.sql.functions as f
from pyspark.sql.types import DateType, StructType, StructField, IntegerType, TimestampType

# Helper object from the local SparkDBUtils module; the rest of the notebook
# goes through it for table reads/writes and sequence lookups.
sparkdb = SparkDB()
# Shortcut to the Spark session exposed by the helper, used for ad-hoc SQL.
spark = sparkdb.spark

In [2]:
# Inspect the current contents of the date_dim table.
spark.sql("select * from date_dim").show()

+-------+----------+--------------------+
|id_date|      date|             ts_load|
+-------+----------+--------------------+
|      1|2022-11-21|2023-02-23 09:52:...|
|      2|2022-11-23|2023-02-23 09:52:...|
|      3|2022-11-24|2023-02-23 09:52:...|
|      4|2022-11-25|2023-02-23 09:52:...|
|      5|2022-11-26|2023-02-23 09:52:...|
|      6|2022-11-27|2023-02-23 09:52:...|
|      7|2022-11-28|2023-02-23 09:52:...|
|      8|2022-11-29|2023-02-23 09:52:...|
|      9|2022-12-01|2023-02-23 09:52:...|
|     10|2022-12-03|2023-02-23 09:52:...|
|     11|2022-12-06|2023-02-23 09:52:...|
|     12|2022-12-08|2023-02-23 09:52:...|
|     13|2022-12-10|2023-02-23 09:52:...|
|     14|2022-12-11|2023-02-23 09:52:...|
|     15|2022-12-12|2023-02-23 09:52:...|
|     16|2022-12-13|2023-02-23 09:52:...|
|     17|2022-12-19|2023-02-23 09:52:...|
|     18|2022-12-22|2023-02-23 09:52:...|
|     19|2022-12-23|2023-02-23 09:52:...|
|     20|2022-12-24|2023-02-23 09:52:...|
+-------+----------+--------------------+

In [3]:
# Inspect the sequence bookkeeping table (one row per table_name with its last id).
spark.sql("select * from sequences_cfg").show()

+----------+---+--------------------+
|table_name| id|             ts_load|
+----------+---+--------------------+
|  date_dim| 39|2023-02-23 09:50:...|
+----------+---+--------------------+



In [5]:
# Remove every row from date_dim; the displayed result is the affected-row count.
# NOTE(review): sequences_cfg is not touched by this cell, so the stored
# sequence for date_dim presumably keeps its previous value — confirm intended.
spark.sql("delete from date_dim").show()

+-----------------+
|num_affected_rows|
+-----------------+
|               39|
+-----------------+



In [53]:
 def insert_id(df: pyspark.sql.DataFrame, table_name: str) -> pyspark.sql.DataFrame:
        """Fill the ``id`` column of ``df`` with consecutive sequence numbers.

        The numbering continues from the value returned by
        ``sparkdb.read_last_seq(table_name)``. After numbering, the row for
        ``table_name`` in ``sequences_cfg`` is updated with the highest id
        that was assigned.

        Args:
            df: Frame whose ``id`` column is (re)written. Must have at least
                one column, because the first column is used to order rows.
            table_name: Key used to look up and update the sequence in
                ``sequences_cfg``.

        Returns:
            A new DataFrame equal to ``df`` with ``id`` set to
            ``last_seq + row_number``. If ``df`` has no rows it is returned
            with the (empty) ``id`` column and ``sequences_cfg`` is left
            untouched.

        NOTE(review): reading the sequence and updating it are two separate
        statements; concurrent callers could presumably obtain overlapping
        ids — confirm whether that matters for this pipeline.
        """
        # Window ordered by an arbitrary column of the *input* frame; it exists
        # only so that row_number() can be applied. (The previous version read
        # the notebook-global ``df_new`` here instead of the ``df`` argument.)
        window_spec = pyspark.sql.window.Window \
            .orderBy(df.columns[0])

        # Last sequence value that was handed out for this table.
        seq = sparkdb.read_last_seq(table_name)

        # Overwrite the id column with consecutive values after the last sequence.
        df = df. \
            withColumn("id", f.row_number().over(window_spec) + seq)

        # Highest id just assigned. Aggregating in Spark avoids the
        # pandas-on-Spark conversion and yields None (not NaN) for an empty frame.
        max_seq = df.agg(f.max("id")).first()[0]
        if max_seq is None:
            # Nothing was numbered, so there is no new sequence to persist.
            return df

        # Persist the new high-water mark in the sequences table.
        spark.sql(f"""
            update sequences_cfg set id={int(max_seq)} 
            where table_name == '{table_name}'
            """)

        return df

In [9]:
# Column layout for the sample rows: nullable id, calendar date and load timestamp.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("date", DateType(), True),
    StructField("ts_load", TimestampType(), True),
])

# Two sample rows; id is left as None so it can be filled in afterwards.
df_new = sparkdb.spark.createDataFrame(
    [
        (None, dt.datetime(2020, 5, 17), dt.datetime.now()),
        (None, dt.datetime(2020, 5, 25), dt.datetime.now()),
    ],
    schema=schema,
)

In [50]:
# Number the sample rows using the date_dim sequence.
# NOTE: this rebinds df_new to the function's result and insert_id also updates
# sequences_cfg, so re-running this cell assigns a fresh range of ids each time.
df_new = insert_id(df_new,"date_dim")

12


In [51]:
# Display the sample rows after the id column has been filled in.
df_new.show()

+---+----------+--------------------+
| id|      date|             ts_load|
+---+----------+--------------------+
| 11|2020-05-17|2023-02-22 07:16:...|
| 12|2020-05-25|2023-02-22 07:16:...|
+---+----------+--------------------+



In [52]:
# Write the numbered rows to date_dim through the helper, passing "append" as the mode.
sparkdb.write_table(df_new, "date_dim", "append")

# Read the table back through the helper and display it to check the result.
sparkdb.read_table("date_dim").show()

+---+----------+--------------------+
| id|      date|             ts_load|
+---+----------+--------------------+
|  7|2020-05-17|2023-02-22 07:16:...|
|  8|2020-05-25|2023-02-22 07:16:...|
| 11|2020-05-17|2023-02-22 07:16:...|
| 12|2020-05-25|2023-02-22 07:16:...|
|  5|2020-05-17|2023-02-22 07:16:...|
|  6|2020-05-25|2023-02-22 07:16:...|
|  9|2020-05-17|2023-02-22 07:16:...|
| 10|2020-05-25|2023-02-22 07:16:...|
+---+----------+--------------------+

