In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable 
from pyspark.sql.window import Window

In [0]:
%run /Workspace/Users/eshivee@gmail.com/Atlikon_SportsBar_Data_Pipeline/Set_Up/utilities

In [0]:
# Notebook parameters: declare both widgets first, then read their current values.
dbutils.widgets.text('catalog', 'sportats', 'Catalog')
dbutils.widgets.text('data_source', 'gross_price', 'Data Source')

catalog = dbutils.widgets.get('catalog')
data_source = dbutils.widgets.get('data_source')

# Glob matching every CSV file under the folder named after the data source.
basepath = 's3://sportsbarsa/' + data_source + '/*.csv'
print(basepath)

In [0]:
# Read the raw CSV files with a header row and inferred column types, then
# stamp each row with the load time and the name/size of the file it came from.
csv_reader = spark.read.format('csv').options(header=True, inferSchema=True)

df = (
    csv_reader.load(basepath)
    .withColumn('read_timestamp', current_timestamp())
    .select('*', '_metadata.file_name', '_metadata.file_size')
)

In [0]:
# Replace the bronze Delta table with the freshly loaded rows.
bronze_writer = (
    df.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
)
bronze_writer.saveAsTable(f'{catalog}.{bronze_schema}.{data_source}')

In [0]:
# Echo the fully qualified name of the bronze table that was just written.
'{}.{}.{}'.format(catalog, bronze_schema, data_source)

**SILVER LAYER TABLE GROSS_PRICE**

In [0]:
# Load the bronze table back and preview a handful of rows.
bronze_table = f'{catalog}.{bronze_schema}.{data_source}'
df_bronze = spark.read.table(bronze_table)
df_bronze.limit(5).display()

In [0]:
# Convert 'month' to a date. Rows use different date formats, so each format is
# tried in the order listed and the first one that parses is kept.
# Formats present in the column: yyyy/MM/dd, dd/MM/yyyy, yyyy-MM-dd, dd-MM-yyyy.
MONTH_DATE_FORMATS = ['yyyy/MM/dd', 'dd/MM/yyyy', 'yyyy-MM-dd', 'dd-MM-yyyy']

parsed_month = coalesce(*[try_to_date(col('month'), fmt) for fmt in MONTH_DATE_FORMATS])
df_silver = df_bronze.withColumn('month', parsed_month)

df_silver.select('month').display()

In [0]:
# Clean gross_price:
#   * values that look like a plain decimal number are cast to double,
#     with negative values sign-flipped so they become positive;
#   * anything else is replaced by 0.
raw_price = col("gross_price")
price_as_double = col('gross_price').cast('double')
looks_numeric = raw_price.rlike(r'^-?\d+(\.\d+)?$')

non_negative_price = when(raw_price < 0, -1 * price_as_double).otherwise(price_as_double)

df_silver = df_silver.withColumn(
    'gross_price',
    when(looks_numeric, non_negative_price).otherwise(0),
)



In [0]:
# Print df_silver's column names and types after the month and gross_price conversions.
df_silver.printSchema()

In [0]:
# Attach product_code to every price row by joining on product_id.
#
# The products table is resolved from the same catalog/schema variables as the
# rest of the notebook instead of a hard-coded 'sportats.silver' prefix.
# NOTE(review): assumes silver_schema resolves to 'silver' — confirm.
df_products = spark.read.table(f'{catalog}.{silver_schema}.products')

# Joining with on='product_id' leaves a single, unambiguous product_id column
# in the result, so the select below no longer needs the fully qualified
# 'sportats.bronze.gross_price.product_id' reference, which only resolved while
# the catalog and data_source widgets were left at their default values.
df_joined = (
    df_silver
    .join(df_products.select('product_id', 'product_code'), on='product_id', how='inner')
    .select('product_id', 'product_code', 'month', 'gross_price', 'read_timestamp', 'file_name', 'file_size')
)

df_joined.display()

In [0]:
# Replace the silver Delta table with the cleaned, product-enriched rows.
silver_writer = (
    df_joined.write
    .format('delta')
    .mode('overwrite')
    .option('mergeSchema', 'true')
    .option('delta.enableChangeDataFeed', 'true')
)
silver_writer.saveAsTable(f'{catalog}.{silver_schema}.{data_source}')

**GOLD LAYER GROSS PRICES TABLE**

In [0]:
# Re-read the silver table from storage. This rebinds df_silver, which until
# now referred to the in-memory cleaned frame built from the bronze table.
df_silver = spark.read.table(f'{catalog}.{silver_schema}.{data_source}')

In [0]:
# The gold table keeps only the product code, the month and the price.
gold_columns = ['product_code', 'month', 'gross_price']
df_gold = df_silver.select(*gold_columns)
df_gold.limit(5).display()

In [0]:
# Replace the gold Delta table (named sb_dim_<data_source>) with the projection.
gold_writer = (
    df_gold.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
)
gold_writer.saveAsTable(f'{catalog}.{gold_schema}.sb_dim_{data_source}')

In [0]:
# Reduce the monthly prices to one price per product and year: the price of the
# latest month available in that year.
#
# Changes from the previous version:
#   * rows whose month could not be parsed (NULL) are dropped first; otherwise
#     they form a NULL-year group and leak into the result;
#   * row_number replaces dense_rank, so duplicate rows for the latest month
#     give exactly one output row instead of several. Ties within a month are
#     broken by the higher gross_price purely to make the pick deterministic.
latest_month_first = (
    Window
    .partitionBy('product_code', 'year')
    .orderBy(col('m').desc(), col('gross_price').desc())
)

df_gold_child = (
    df_gold
    .filter(col('month').isNotNull())
    .withColumn('year', year('month'))
    .withColumn('m', month('month'))
    .withColumn('row_num', row_number().over(latest_month_first))
    .filter(col('row_num') == 1)
    .select('product_code', 'gross_price', 'year')
)

In [0]:
# Rename gross_price to price_inr, the column name the merge into the
# dim_gross_price table reads from the source.
df_gold_child = df_gold_child.withColumnRenamed('gross_price', 'price_inr')

In [0]:
# Cast both columns to int, one after the other.
for int_column in ('price_inr', 'year'):
    df_gold_child = df_gold_child.withColumn(int_column, col(int_column).cast('int'))

In [0]:
# DataFrame over the merge target, used to report its row count before the merge.
# NOTE(review): the table name is hard-coded and does not follow the `catalog`
# widget — confirm this is intended.
old_df = spark.read.table('sportats.gold.dim_gross_price')

In [0]:
# Upsert the per-product prices into the dim_gross_price Delta table and report
# the table's row count before and after.
#
# The target name is kept as the same literal used by the neighbouring cells,
# but held in one variable so the merge and the post-merge read cannot drift.
target_table = 'sportats.gold.dim_gross_price'

print('rows before merge :', old_df.count())

# df_gold_child is built per (product_code, year), but the merge matches on
# product_code alone. Delta MERGE fails when more than one source row matches
# the same target row, so keep a single row per product: the latest year
# (ties broken by the higher price, only to make the pick deterministic).
# NOTE(review): if the target is meant to hold one row per product AND year,
# add `trg.year = src.year` to the merge condition instead of deduplicating.
latest_year_first = (
    Window
    .partitionBy('product_code')
    .orderBy(col('year').desc(), col('price_inr').desc())
)
df_merge_src = (
    df_gold_child
    .withColumn('_rn', row_number().over(latest_year_first))
    .filter(col('_rn') == 1)
    .drop('_rn')
)

delta_obj = DeltaTable.forName(spark, target_table)

(
    delta_obj.alias('trg')
    .merge(df_merge_src.alias('src'), 'trg.product_code = src.product_code')
    .whenMatchedUpdate(
        set={
            'price_inr': 'src.price_inr',
            'year': 'src.year',
        }
    )
    .whenNotMatchedInsert(
        values={
            'product_code': 'src.product_code',
            'price_inr': 'src.price_inr',
            'year': 'src.year',
        }
    )
    .execute()
)

new_df = spark.read.table(target_table)
print('rows after merge:', new_df.count())

In [0]:
# Show the contents of the merge target table after the upsert.
spark.read.table('sportats.gold.dim_gross_price').display()