In [1]:
import os
import sys
import delta
from pyspark.sql.window import Window
import pyspark.sql.functions as psf
import pandas as pd
import pyspark.sql
from pyspark.sql.types import StructType, StructField, DateType, \
    StringType, FloatType, TimestampType

sys.path.insert(0, os.path.abspath('./source/Producto_DIA_DP'))

from SparkDBUtils import SparkDB
sparkdb = SparkDB()
spark = sparkdb.spark

In [26]:
spark.sql("show tables in analisis_precios").show(truncate=False)

+----------------+------------------------+-----------+
|namespace       |tableName               |isTemporary|
+----------------+------------------------+-----------+
|analisis_precios|date_dim                |false      |
|analisis_precios|precio_dia_agg_norm_fact|false      |
|analisis_precios|producto_dia_fact       |false      |
|analisis_precios|producto_dim            |false      |
|analisis_precios|sequences_cfg           |false      |
+----------------+------------------------+-----------+



In [27]:
df = spark.table("analisis_precios.producto_dia_fact").toPandas()

  series = series.astype(t, copy=False)


In [44]:
df.info()
df[df.id_producto == 18013]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253146 entries, 0 to 253145
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   id_producto  253146 non-null  int32         
 1   id_date      253146 non-null  int32         
 2   price        252719 non-null  float64       
 3   unit_price   252719 non-null  float64       
 4   discount     0 non-null       float64       
 5   year         253146 non-null  int32         
 6   ts_load      253146 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int32(3)
memory usage: 10.6 MB


Unnamed: 0,id_producto,id_date,price,unit_price,discount,year,ts_load


In [42]:
 simple_schema = StructType([
        StructField("date", DateType(), True),
        StructField("product", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("brand", StringType(), True),
        StructField("price", FloatType(), True),
        StructField("categories", StringType(), True),
        StructField("unit_price", FloatType(), True),
        StructField("units", StringType(), True),
        StructField("discount", FloatType(), True),
        StructField("ts_load", TimestampType(), True)
    ])
    
dataset = spark.read.option("delimiter", ";") \
            .csv("../../dataset/dataset.csv", schema=simple_schema, header=True)

In [104]:
dt = dataset.where("product_id == 18013")
dt.show(50)

+----------+--------------------+----------+---------+-----+--------------------+----------+--------+--------+-------+
|      date|             product|product_id|    brand|price|          categories|unit_price|   units|discount|ts_load|
+----------+--------------------+----------+---------+-----+--------------------+----------+--------+--------+-------+
|2022-11-21|CARBONELL aceite ...|     18013|CARBONELL| 5.75|['Despensa', 'Ace...|      7.67|    €/l.|    null|   null|
|2022-11-23|CARBONELL aceite ...|     18013|CARBONELL| 5.75|['Despensa', 'Ace...|      7.67|    €/l.|    null|   null|
|2022-11-24|CARBONELL aceite ...|     18013|CARBONELL| 5.75|['Despensa', 'Ace...|      7.67|    €/l.|    null|   null|
|2022-11-25|CARBONELL aceite ...|     18013|CARBONELL| 5.75|['Despensa', 'Ace...|      7.67|    €/l.|    null|   null|
|2022-11-26|CARBONELL aceite ...|     18013|CARBONELL| 5.75|['Despensa', 'Ace...|      7.67|    €/l.|    null|   null|
|2022-11-27|CARBONELL aceite ...|     18013|CARB

In [76]:

@psf.pandas_udf(StringType())
def split_categoria(categorie_col: pd.Series) -> pd.Series:
    """
    Extrae el primer elemento de la lista de categorias
    """

    salida = categorie_col.apply(lambda x: eval(x)[0])

    return salida
    
# Ventana para obtener la ultima version de cada producto
window_spec = Window \
    .partitionBy("product_id") \
    .orderBy(psf.col("date").desc())

# Nos quedamos con la ultimaer version de cada producto en el dataset,
# ya que se repiten en todas las fechas
dataset_ultimos = dt \
    .withColumn("row_number", psf.row_number().over(window_spec)) \
    .where("row_number = 1") \
    .withColumn("categoria", split_categoria(dt.categories)) \
    .select(["product_id",
             "product",
             "units",
             "brand",
             "categories",
             "categoria",
             "date"])

 # Cargo los registros en base de datos
db = spark.table("analisis_precios.producto_dim").alias("db")

db.show(10)

+-----------+--------------------+-------+--------------------+----------+----------+--------------------+----------+--------------------+
|id_producto|             product|  brand|          categories|product_id|      date|           categoria|     units|             ts_load|
+-----------+--------------------+-------+--------------------+----------+----------+--------------------+----------+--------------------+
|        377|Leche semidesnata...|Pascual|['Leche. huevos y...|    119675|2023-08-23|Leche. huevos y m...|  €/LITRO)|2023-08-29 17:25:...|
|        394|Café en grano nat...| Toscaf|['Café. cacao e i...|    120166|2023-08-23|Café. cacao e inf...|   €/KILO)|2023-08-29 17:25:...|
|        443|Bálsamo after shave |  Nivea|['Perfumería. hig...|    124014|2023-07-30|Perfumería. higie...|€/100 ML.)|2023-08-29 17:25:...|
|        525|Alimento para gat...|     As|['Mascotas', 'Gat...|    128357|2023-08-27|            Mascotas|   €/KILO)|2023-08-29 17:25:...|
|        542|Sardinas en ac

In [81]:
db.where("product_id = 18013").show()

+-----------+--------------------+---------+--------------------+----------+----------+--------------------+--------+--------------------+
|id_producto|             product|    brand|          categories|product_id|      date|           categoria|   units|             ts_load|
+-----------+--------------------+---------+--------------------+----------+----------+--------------------+--------+--------------------+
|       1839|Aceite de oliva v...|Carbonell|['Aceites. salsas...|     18013|2023-07-30|Aceites. salsas y...|€/LITRO)|2023-08-29 17:25:...|
+-----------+--------------------+---------+--------------------+----------+----------+--------------------+--------+--------------------+



In [83]:
 # Obtengo los del dataset que si que están en base de datos
estan = dataset_ultimos.join(db.select(["product_id", "id_producto"]), "product_id", "inner")

estan.show()

+----------+--------------------+--------+---------+--------------------+--------------------+----------+-----------+
|product_id|             product|   units|    brand|          categories|           categoria|      date|id_producto|
+----------+--------------------+--------+---------+--------------------+--------------------+----------+-----------+
|     18013|Aceite de oliva v...|€/LITRO)|Carbonell|['Aceites. salsas...|Aceites. salsas y...|2023-07-30|       1839|
+----------+--------------------+--------+---------+--------------------+--------------------+----------+-----------+



In [97]:
producto_dia_fact = spark.table("analisis_precios.producto_dia_fact")
date_dim = spark.table("analisis_precios.date_dim")
producto_dim = spark.table("analisis_precios.producto_dim").where("product_id = 18013")

date_dim.show(1)

+-------+----------+----+--------------------+
|id_date|      date|year|             ts_load|
+-------+----------+----+--------------------+
|      1|2022-11-21|2022|2023-08-27 17:48:...|
+-------+----------+----+--------------------+
only showing top 1 row



In [102]:
producto_dia_fact\
    .join(producto_dim, "id_producto", "inner")\
    .join(date_dim.alias("dd"), "id_date", "inner")\
    .select(["dd.date"])\
    .orderBy(F.col("dd.date").desc(), descending=True)\
    .show()

+----------+
|      date|
+----------+
|2023-07-30|
|2023-03-11|
|2023-02-28|
|2023-02-20|
|2023-02-11|
|2023-02-10|
|2023-02-08|
|2023-01-25|
|2023-01-23|
|2023-01-20|
|2023-01-19|
|2023-01-18|
|2023-01-16|
|2023-01-13|
|2023-01-12|
|2023-01-07|
|2023-01-06|
|2022-12-31|
|2022-12-29|
|2022-12-28|
+----------+
only showing top 20 rows



In [87]:
producto_dia_fact.where("id_producto = 1839")\
    .orderBy("")
    .show()

+-----------+-------+-----------------+-----------------+--------+----+--------------------+
|id_producto|id_date|            price|       unit_price|discount|year|             ts_load|
+-----------+-------+-----------------+-----------------+--------+----+--------------------+
|       1839|      4|             5.75|7.670000076293945|    null|2022|2023-08-27 17:49:...|
|       1839|      6|             5.75|7.670000076293945|    null|2022|2023-08-27 17:49:...|
|       1839|     22|7.050000190734863|9.399999618530273|    null|2022|2023-08-27 17:49:...|
|       1839|     15|7.050000190734863|9.399999618530273|    null|2022|2023-08-27 17:49:...|
|       1839|     13|7.050000190734863|9.399999618530273|    null|2022|2023-08-27 17:49:...|
|       1839|     16|7.050000190734863|9.399999618530273|    null|2022|2023-08-27 17:49:...|
|       1839|     23|7.050000190734863|9.399999618530273|    null|2022|2023-08-27 17:49:...|
|       1839|     19|7.050000190734863|9.399999618530273|    null|2022

# DELETE

In [2]:
dt = delta.DeltaTable.forName(spark, "producto_dia.staging_product")

In [4]:
dt.delete(psf.column("id_producto") != "18013")

In [5]:
dt.toDF().show()

+-----------+--------------------+-----+
|id_producto|         url_product|index|
+-----------+--------------------+-----+
|      18013|https://www.dia.e...| 1747|
+-----------+--------------------+-----+



{:E)

  /==o  /---/
{:E)===|
  \==o  \---\