In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/eshivee@gmail.com/Atlikon_SportsBar_Data_Pipeline/Set_Up/utilities

In [0]:
# Notebook parameters: register both text widgets (with their defaults) first,
# then read whatever values are currently set on them.
dbutils.widgets.text('catalog', 'sportats', 'Catalog')
dbutils.widgets.text('data_source', 'customers', 'Data Source')

catalog = dbutils.widgets.get('catalog')
data_source = dbutils.widgets.get('data_source')

In [0]:
# Echo the resolved widget values so the run shows which catalog/source it used.
print(catalog, data_source)

In [0]:
# Glob pattern covering every .csv file in this data source's folder of the S3 bucket.
base_path = f's3://sportsbarsa/{data_source}/*.csv'
print(base_path)

In [0]:
# Load every CSV matched by base_path into one DataFrame: the first row of each
# file is treated as the header and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(base_path)
)

In [0]:
# Add audit columns: the time of this read plus the originating file's name and
# size taken from the _metadata column.
df = (
    df
    .withColumn('read_timestamp', current_timestamp())
    .select('*', '_metadata.file_name', '_metadata.file_size')
)

In [0]:
# Save the ingested rows as the bronze Delta table, overwriting what was there.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")
)

**SILVER LAYER CUSTOMERS TABLE**

In [0]:
# Re-read the bronze table that was just written; it is the input to the silver cleaning steps.
df_bronze = spark.read.table(f"{catalog}.{bronze_schema}.{data_source}")

In [0]:
# Preview up to 10 bronze rows.
df_bronze.limit(10).display()

In [0]:
# Keep a single row per customer_id, then show the de-duplicated frame.
df_silver = df_bronze.drop_duplicates(subset=["customer_id"])
df_silver.display()

In [0]:
# Strip leading/trailing spaces from customer_name.
df_silver = df_silver.withColumn(
    'customer_name',
    trim(col('customer_name')),
)

In [0]:
# List the distinct city values to spot misspellings before cleaning.
df_silver.select('city').distinct().display()

In [0]:
# Canonical city names accepted downstream; every other value ends up NULL.
allowed_cities = ['New Delhi', 'Bengaluru', 'Hyderabad']

# Known misspellings -> canonical name. DataFrame.replace matches whole values
# exactly, and the column is trimmed before the lookup, so keys must not carry
# leading/trailing spaces (the former 'NewDelhi ' key is covered by 'NewDelhi').
city_mapping = {
    'Bengaluruu': 'Bengaluru',
    'Bengalore': 'Bengaluru',
    'Hyderabadd': 'Hyderabad',
    'Hyderbad': 'Hyderabad',
    'NewDelhi': 'New Delhi',
    'NewDheli': 'New Delhi',
    'NewDelhee': 'New Delhi',
}

df_silver = (
    df_silver
    # Trim first so stray whitespace cannot defeat the exact-match replace or
    # the allow-list check below.
    .withColumn('city', trim(col('city')))
    .replace(city_mapping, subset=['city'])
    # when() without otherwise() yields NULL for anything not in the allow-list
    # (including values that were already NULL).
    .withColumn('city', when(col('city').isin(allowed_cities), col('city')))
)

In [0]:
# Re-check the distinct city values after normalisation.
df_silver.select(col('city')).distinct().display()

In [0]:
# Preview only (nothing is assigned): distinct customer names after initcap.
# initcap leaves NULL as NULL, so no explicit NULL branch is needed.
(
    df_silver
    .select(initcap(col('customer_name')).alias('customer_name'))
    .distinct()
    .display()
)

In [0]:
# Apply the capitalisation previewed above; NULL names stay NULL because
# initcap returns NULL for NULL input.
df_silver = df_silver.withColumn(
    'customer_name',
    initcap(col('customer_name')),
)

In [0]:
# Show the rows whose city is NULL after normalisation.
df_silver.filter(col('city').isNull()).display()

In [0]:
# Customer names to look up; show every row carrying one of these names.
customer_name_with_null_city = [
    'Sprintx Nutrition',
    'Zenathlete Foods',
    'Peak Performance Store',
    'Primefuel Nutrition',
    'Recovery Lane',
]

df_silver.where(
    col('customer_name').isin(customer_name_with_null_city)
).display()

In [0]:
# Hand-maintained customer_id -> city overrides.
customer_city_fix = {
    789403: "New Delhi",
    789420: "Bengaluru",
    789503: "New Delhi",
    789521: "Hyderabad",
    789603: "Hyderabad",
}

# Turn the overrides into a two-column DataFrame so they can be joined on customer_id.
df_fix = spark.createDataFrame(
    list(customer_city_fix.items()),
    schema=["customer_id", "fixed_city"],
)

In [0]:
# Show the override frame (customer_id, fixed_city).
df_fix.display()

In [0]:
# Left join keeps every silver row; fixed_city is NULL where no override exists.
df_silver = df_silver.join(df_fix, on='customer_id', how='left')

In [0]:
# Inspect the joined frame, including the fixed_city column added by the join.
df_silver.display()

In [0]:
# Fill city from fixed_city only where city is NULL (coalesce takes the first
# non-NULL value), then drop the helper column.
df_silver = (
    df_silver
    .withColumn('city', coalesce(col('city'), col('fixed_city')))
    .drop('fixed_city')
)
df_silver.display()

In [0]:
# customer_id is never aggregated on, so store it as a string.
df_silver = df_silver.withColumn(
    'customer_id',
    col('customer_id').cast('string'),
)

df_silver.printSchema()

In [0]:
# Preview up to 10 rows after the cast.
df_silver.limit(10).display()

In [0]:
# Derive the business columns:
#   customer - "<customer_name>-<city>", with 'unknown' standing in for a NULL city
#   market / platform / channel - constant labels applied to every row
df_silver = (
    df_silver
    .withColumn(
        'customer',
        concat_ws('-', 'customer_name', coalesce(col('city'), lit('unknown'))),
    )
    .withColumn('market', lit('India'))
    .withColumn('platform', lit('Sports Bar'))
    .withColumn('channel', lit('Acquisition'))
)

# Preview five rows. The row cap has to be applied with limit(); passing a
# number to display() does not restrict the output.
df_silver.limit(5).display()

In [0]:
# Save the cleaned customers as the silver Delta table, overwriting previous contents.
(
    df_silver.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .option('mergeSchema', 'true')
    .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")
)

**GOLD LAYER CUSTOMERS**

In [0]:
# Re-read the silver table that was just written; it is the input to the gold layer.
df_silver = spark.read.table(f'{catalog}.{silver_schema}.{data_source}')


In [0]:
# Gold keeps only the dimension attributes; the ingestion audit columns are left behind.
df_gold = df_silver.select(
    'customer_id',
    'customer_name',
    'city',
    'customer',
    'market',
    'platform',
    'channel',
)

In [0]:
# Inspect the gold projection before writing it.
df_gold.display()

In [0]:
# Save the projection as the gold child dimension table (sb_dim_<data_source>),
# overwriting previous contents.
(
    df_gold.write
    .format('delta')
    .mode('overwrite')
    .option('delta.enableChangeDataFeed', 'true')
    .saveAsTable(f'{catalog}.{gold_schema}.sb_dim_{data_source}')
)

In [0]:
# Merge the child dimension table into the parent dimension table.
#
# Table names are built from the notebook parameters so the merge reads the
# same child table the previous step wrote, instead of a hard-coded
# 'sportats.gold.*' name that ignores the widgets.
# NOTE(review): assumes the parent table dim_<data_source> lives in the same
# catalog and gold schema as the child — confirm.
deltaTable = DeltaTable.forName(spark, f'{catalog}.{gold_schema}.dim_{data_source}')

# Source rows for the merge: customer_id is exposed as customer_code, which is
# the key used in the merge condition.
df_gold_child = (
    spark.read.table(f'{catalog}.{gold_schema}.sb_dim_{data_source}')
    .select(
        col('customer_id').alias('customer_code'),
        col('customer'),
        col('market'),
        col('platform'),
        col('channel'),
    )
)

In [0]:
# Upsert on customer_code: matching target rows are updated from the source,
# unmatched source rows are inserted.
(
    deltaTable.alias('trg')
    .merge(
        df_gold_child.alias('src'),
        'trg.customer_code = src.customer_code',
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
# Show the parent dimension table after the merge. The name is built from the
# notebook parameters rather than hard-coded, so it follows the catalog widget.
# NOTE(review): assumes the parent table is dim_<data_source> in the gold schema — confirm.
spark.read.table(f'{catalog}.{gold_schema}.dim_{data_source}').display()