In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/eshivee@gmail.com/Atlikon_SportsBar_Data_Pipeline/Set_Up/utilities

In [0]:
# Notebook parameters: declare both widgets first, then read their current values.
# Defaults target the 'sportats' catalog and the 'products' feed.
dbutils.widgets.text('catalog', 'sportats', 'Catalog')
dbutils.widgets.text('data_source', 'products', 'Data Source')

catalog = dbutils.widgets.get('catalog')
data_source = dbutils.widgets.get('data_source')

# Glob matching every CSV file in the selected data source's folder.
basepath = f's3://sportsbarsa/{data_source}/*.csv'
print(basepath)

In [0]:
# Ingest the raw CSV files: the first row is the header and column types are
# inferred. Every row is stamped with the load time plus the name and size of
# the file it came from (read from the file-source _metadata column).
df = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load(basepath)
    .withColumn('read_timestamp', current_timestamp())
    .select('*', '_metadata.file_name', '_metadata.file_size')
)

display(df)

In [0]:
# Persist the raw frame as the bronze Delta table, replacing any previous load.
# NOTE(review): bronze_schema is not defined in this notebook; presumably it
# comes from the utilities notebook pulled in via %run - confirm.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f'{catalog}.{bronze_schema}.{data_source}')
)

In [0]:
# Preview the bronze table that was just written. The name is built from the
# same widget values used for the write, so the preview follows the selected
# catalog / data source instead of always reading sportats.bronze.products.
spark.read.table(f'{catalog}.{bronze_schema}.{data_source}').limit(10).display()

**SILVER PRODUCTS TABLE LAYER**

In [0]:
# Starting point for the silver layer: the bronze table for this data source.
bronze_table = f'{catalog}.{bronze_schema}.{data_source}'
df_bronze = spark.read.table(bronze_table)

In [0]:
# Quick look at the first ten bronze rows before cleaning.
display(df_bronze.limit(10))

In [0]:
# We want to take care of the messy data in this data set. This may include splitting columns, capitalizing certain text, fixing spelling errors, etc.

In [0]:
# Keep a single row per product_id, printing the row count on each side of the
# de-duplication so the number of dropped rows is visible.
rows_before = df_bronze.count()
print('rows before dropping dupes:', rows_before)

df_silver = df_bronze.dropDuplicates(['product_id'])
print('rows after dropping dupes:', df_silver.count())


In [0]:
# Title-case the category labels (e.g. 'protein bars' -> 'Protein Bars').
# initcap() already yields NULL for NULL input, so no explicit null branch is
# needed around it.
df_silver = df_silver.withColumn('category', initcap(col('category')))

df_silver.limit(10).display()

In [0]:
# List the distinct category values to spot misspellings.
display(df_silver.select('category').distinct())

In [0]:
# Correct known misspellings in the category column with an exact-match lookup
# (misspelt value -> corrected value).
mapping = {'Protien Bars': 'Protein Bars'}

df_silver = df_silver.replace(to_replace=mapping, subset=['category'])

# Re-list the distinct categories to confirm the correction was applied.
display(df_silver.select('category').distinct())

In [0]:
# Fix the same misspelling inside product names. (?i) makes the match
# case-insensitive; every hit is rewritten as 'Protein'.
df_silver = df_silver.withColumn(
    'product_name',
    regexp_replace(col('product_name'), "(?i)Protien", "Protein"),
)

display(df_silver.select('product_name').distinct().limit(10))

In [0]:
# Inspect the frame after the spelling corrections.
df_silver.display()

In [0]:
# To match the schema of the gold layer from the Atlikon data, derive a
# 'division' column that groups each category. Categories not listed here
# (including NULL) fall through to 'Other'.
division_expr = (
    when(col('category').isin('Energy Bars', 'Protein Bars'), 'Nutrition Bars')
    .when(col('category') == 'Granola & Cereals', 'Breakfast Foods')
    .when(col('category') == 'Recovery Dairy', 'Dairy & Recovery')
    .when(col('category') == 'Healthy Snacks', 'Healthy Snacks')
    .when(col('category') == 'Electrolyte Mix', 'Hydration & Electrolytes')
    .otherwise('Other')
)

df_silver = df_silver.withColumn('division', division_expr)

In [0]:
# Check the derived division values.
display(df_silver.select('division'))

In [0]:
# Inspect the frame with the new division column in place.
display(df_silver)

In [0]:
# Pull the pack size out of the product name: a parenthesised number followed
# by letters, e.g. '(60g)' or '(500 ml)'. regexp_extract returns an empty
# string (not NULL) when the name contains no such group.
df_silver = df_silver.withColumn(
    "variant",
    regexp_extract(col("product_name"), r"\((\d+\s*[A-Za-z]+)\)", 1),
)

In [0]:
# product_code: SHA-256 hex digest of the product name.
product_code_expr = sha2(col('product_name').cast('string'), 256)

# product_id: kept (as a string) only when it consists purely of digits; any
# other value, including NULL, is replaced by the sentinel '999999'.
product_id_text = col('product_id').cast('string')
product_id_expr = (
    when(product_id_text.rlike("^[0-9]+$"), product_id_text)
    .otherwise(lit(999999).cast('string'))
)

# The hash is computed before product_name is renamed to 'product'.
df_silver = (
    df_silver
    .withColumn("product_code", product_code_expr)
    .withColumn('product_id', product_id_expr)
    .withColumnRenamed('product_name', 'product')
)

In [0]:
# Inspect the frame with product_code, the cleaned product_id and the renamed product column.
display(df_silver)

In [0]:
# Fix the silver column set and order: key and descriptive attributes first,
# ingestion metadata last. Any column not listed here is dropped.
silver_columns = [
    'product_code', "division", "category", "product", 'variant',
    'product_id', "read_timestamp", "file_name", 'file_size',
]
df_silver = df_silver.select(*silver_columns)

In [0]:
# Replace the silver Delta table with the cleaned frame. mergeSchema is set so
# the write is allowed to evolve an existing table's schema.
(
    df_silver.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .option('mergeSchema', 'true')
    .saveAsTable(f'{catalog}.{silver_schema}.{data_source}')
)

In [0]:
# Read the silver table back to confirm what was written.
display(spark.read.table(f'{catalog}.{silver_schema}.{data_source}'))

**PRODUCTS TABLE GOLD LAYER**

In [0]:

# Re-read silver from the stored table and keep only the dimension attributes;
# the ingestion metadata columns are left behind.
df_silver = spark.read.table(f'{catalog}.{silver_schema}.{data_source}')

gold_columns = ['product_code', 'product_id', 'division', 'category', 'product', 'variant']
df_gold = df_silver.select(*gold_columns)

display(df_gold)

In [0]:
# Replace the gold dimension table for this data source (sb_dim_<data_source>).
(
    df_gold.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f'{catalog}.{gold_schema}.sb_dim_{data_source}')
)

In [0]:
# Read the gold dimension table back to confirm what was written.
display(spark.read.table(f'{catalog}.{gold_schema}.sb_dim_{data_source}'))

In [0]:
# Source rows for the merge into the shared product dimension. product_code is
# a hash of the product name, while earlier de-duplication was on product_id,
# so two product_ids sharing a name (for example once 'Protien' has been
# corrected to 'Protein') yield the same product_code. Delta MERGE fails when
# several source rows match one target row, and unmatched duplicates would be
# inserted more than once, so keep exactly one row per product_code.
# NOTE(review): dropDuplicates keeps an arbitrary row per key - confirm that is
# acceptable if duplicates can differ in category/division.
df_gold_child = (
    df_gold
    .select('product_code', 'division', 'category', 'product', 'variant')
    .dropDuplicates(['product_code'])
)
df_gold_child.display()

In [0]:
# Handle to the shared product dimension table that the merge writes into.
# The name is built from the catalog widget and gold_schema, like the other
# gold reads/writes in this notebook, so a non-default catalog is honoured
# instead of always targeting sportats.gold.dim_products.
delta_obj = DeltaTable.forName(spark, f'{catalog}.{gold_schema}.dim_products')

In [0]:
# Upsert the child rows into the product dimension, matching on product_code:
# rows whose code already exists get their descriptive attributes refreshed,
# rows with a new code are inserted.
update_assignments = {
    'division': 'src.division',
    'category': 'src.category',
    'product': 'src.product',
    'variant': 'src.variant',
}
# An insert sets the key plus the same attribute columns as an update.
insert_assignments = {'product_code': 'src.product_code', **update_assignments}

(
    delta_obj.alias("trg")
    .merge(df_gold_child.alias("src"), "trg.product_code = src.product_code")
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)

In [0]:
# Show the product dimension after the merge. The table name follows the
# catalog widget and gold_schema rather than a hard-coded
# sportats.gold.dim_products, matching the other gold reads in this notebook.
spark.read.table(f'{catalog}.{gold_schema}.dim_products').display()