In [2]:
import os
import sys
sys.path.insert(0, os.path.abspath('./source/etl'))

from pyspark.sql import SparkSession
from SparkDBUtils import SparkDB
from delta import DeltaTable
import datetime as dt
import pyspark.sql
import pyspark.sql.functions as f
from pyspark.sql.types import DateType, StructType, StructField, IntegerType, TimestampType, StringType, FloatType
from pyspark.sql.window import Window

sparkdb = SparkDB()
spark = sparkdb.spark

In [27]:
simple_schema = StructType([
        StructField("date", DateType(), True),
        StructField("product", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("price", FloatType(), True),
        StructField("categories", StringType(), True),
        StructField("unit_price", FloatType(), True),
        StructField("units", StringType(), True),
        StructField("discount", FloatType(), True),
        StructField("ts_load", TimestampType(), True)
    ])

dataset = spark.read.option("delimiter", ";") \
    .csv("../../dataset/dataset.csv", schema=simple_schema, header=True)

In [28]:
year = 2023

# tabkla de productos para hacer el lookup
product_dim_db = sparkDB.read_table("producto_dim").select(["product_id", "id_producto"])

# tabla de fechas para hacer el lookup
date_dim_db = sparkDB.read_table("date_dim").select(["date", "id_date", "year"])

# Cogemos los hechos del dataset de la particion
producto_dia_fact_new = dataset\
            .join(product_dim_db.alias("p"), "product_id", "left")\
            .join(date_dim_db.alias("d"), "date", "left")\
            .select(["price",
                     "unit_price",
                     "discount",
                     "id_producto",
                     "id_date",
                     "d.year"])\
            .where(f"year = {year}")

In [7]:
p = DeltaTable.forName(spark, "producto_dia_fact")

In [15]:
df = p.toDF()

In [20]:
df.count()

242394

In [19]:
df.where("year = 2023").count()

93939

In [21]:
p.delete("year = 2023")

In [22]:
df.count()

148455

In [23]:
df.where("year = 2023").count()

0

In [24]:
producto_dia_fact_new.count()

93939

In [32]:
producto_dia_fact_new.show(10)

+-----+----------+--------+-----------+-------+----+
|price|unit_price|discount|id_producto|id_date|year|
+-----+----------+--------+-----------+-------+----+
| 1.15|      1.92|    null|       4675|     26|2023|
|12.59|      0.47|    null|       3587|     26|2023|
| 1.05|      21.0|    null|       1553|     26|2023|
| 1.19|      14.0|    null|       5189|     26|2023|
| 1.51|      1.91|    null|       6819|     26|2023|
| 2.95|     14.75|    null|        438|     26|2023|
| 1.25|      1.25|    null|       5131|     26|2023|
| 2.39|      4.78|    null|       6627|     26|2023|
| 2.79|      0.93|    null|       2850|     26|2023|
| 1.89|      7.27|    null|        957|     26|2023|
+-----+----------+--------+-----------+-------+----+
only showing top 10 rows



In [35]:
p.toDF().show(10)

+-----------+-------+-----------------+------------------+--------+----+--------------------+
|id_producto|id_date|            price|        unit_price|discount|year|             ts_load|
+-----------+-------+-----------------+------------------+--------+----+--------------------+
|        928|      1|4.090000152587891| 2.049999952316284|    null|2022|2023-03-20 19:24:...|
|       3185|      1|1.350000023841858| 1.350000023841858|    null|2022|2023-03-20 19:24:...|
|       6392|      1|5.789999961853027|  7.71999979019165|    null|2022|2023-03-20 19:24:...|
|       4919|      1|             2.25|13.239999771118164|    null|2022|2023-03-20 19:24:...|
|       1529|      1|             1.25| 6.940000057220459|    null|2022|2023-03-20 19:24:...|
|       6455|      1|1.690000057220459| 1.690000057220459|    null|2022|2023-03-20 19:24:...|
|       4470|      1|1.590000033378601|10.600000381469727|    null|2022|2023-03-20 19:24:...|
|       5637|      1|7.789999961853027|12.979999542236328|  

In [36]:
p.toDF().printSchema()

root
 |-- id_producto: integer (nullable = true)
 |-- id_date: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- ts_load: timestamp (nullable = true)



In [25]:
producto_dia_fact_new.write.format("delta").saveAsTable("producto_dia_fact", mode="append")

AnalysisException: Failed to merge fields 'price' and 'price'. Failed to merge incompatible data types DoubleType and FloatType