# CREATE FLAG PARAMETER

In [0]:
from pyspark.sql.functions import*
from pyspark.sql.types import*

In [0]:
# Register the 'incremental_flag' text widget; its value is '0' unless one is supplied.
dbutils.widgets.text('incremental_flag', '0')

In [0]:
# Read the widget's current value; it is compared against the string '0' further down.
incremental_flag = dbutils.widgets.get('incremental_flag')
# Echo the value so the run mode is visible in the cell output.
print(incremental_flag)

0


# CREATING DATE DIMENSION (dim_date)

FETCH RELEVANT COLUMNS

In [0]:
# Distinct Date_ID values read from the silver-layer `carsales` Parquet data in Azure Data Lake;
# these are the candidate rows for the date dimension.
src_query = '''
    SELECT DISTINCT Date_ID
    FROM PARQUET.`abfss://silver@azstrgaccenterprisecars.dfs.core.windows.net/carsales`
'''
df_src = spark.sql(src_query)

Fetch only the schema (no rows) if the table does not exist yet

In [0]:
# Choose the query that matches the current state of the dimension, then run it once.
if spark.catalog.tableExists('cars_catalog.gold.dim_date'):
    # The dimension exists: read the key pairs that have already been loaded.
    sink_query = '''
        SELECT dim_date_key, Date_ID
        FROM cars_catalog.gold.dim_date
    '''
else:
    # No dimension yet: `WHERE 1=0` returns zero rows, so only the two-column schema comes back.
    sink_query = '''
        SELECT 1 as dim_date_key, Date_ID
        FROM PARQUET.`abfss://silver@azstrgaccenterprisecars.dfs.core.windows.net/carsales`
        WHERE 1=0
    '''
df_sink = spark.sql(sink_query)

In [0]:
# Preview the rows currently in the dimension (no rows when the table does not exist yet).
df_sink.display()

dim_date_key,Date_ID
1,DT00029
2,DT00140
3,DT00192
4,DT00444
5,DT00475
6,DT00947
7,DT00976
8,DT01028
9,DT01099
10,DT00657


### FILTERING new records and old records

In [0]:

# Left-join the source onto the dimension so every source Date_ID is kept;
# dim_date_key is NULL for any Date_ID that has no match in the dimension.
date_id_match = df_src['Date_ID'] == df_sink['Date_ID']
df_filter = (
    df_src
    .join(df_sink, on=date_id_match, how='left')
    .select(df_src['Date_ID'], df_sink['dim_date_key'])
)


### df_filter_old

In [0]:
# Rows that matched the dimension already carry a surrogate key.
has_key = df_filter['dim_date_key'].isNotNull()
df_filter_old = df_filter.where(has_key)
df_filter_old.display()

dim_date_key,Date_ID
1,DT00029
2,DT00140
3,DT00192
4,DT00444
5,DT00475
6,DT00947
7,DT00976
8,DT01028
9,DT01099
10,DT00657


### df_filter_new

In [0]:
# Rows without a match have no surrogate key; keep only Date_ID for them.
missing_key = df_filter['dim_date_key'].isNull()
df_filter_new = df_filter.where(missing_key).select(df_filter['Date_ID'])

### Create surrogate key

**Fetch the max surrogate key from Existing Table**

In [0]:
# Work out the first surrogate key to hand out to new rows:
#   - incremental_flag == '0'  -> keys start at 1
#   - otherwise                -> continue after the highest key already in the dimension
if incremental_flag == '0' or not spark.catalog.tableExists('cars_catalog.gold.dim_date'):
    # Flag says start from scratch, or the dimension table does not exist yet,
    # so there is no existing key to continue from.
    max_value = 1
else:
    max_value_df = spark.sql('''
        SELECT MAX(dim_date_key) as max_value
        from cars_catalog.gold.dim_date
    ''')
    current_max = max_value_df.collect()[0][0]
    # MAX() yields NULL (None in Python) for an empty table; `None + 1` would raise TypeError.
    max_value = 1 if current_max is None else current_max + 1


Create the surrogate key column, offset by the max existing surrogate key

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

In [0]:
# monotonically_increasing_id() yields unique, increasing (but not necessarily consecutive) ids;
# adding max_value offsets them so new keys begin at max_value.
surrogate_key = max_value + monotonically_increasing_id()
df_filter_new = df_filter_new.withColumn('dim_date_key', surrogate_key)


Create Final DF - df_filter_old + df_filter_new

In [0]:
# Combine the already-keyed rows with the newly-keyed rows.
# unionByName matches columns by name instead of position, so a differing column order in
# either input cannot silently place Date_ID strings in the dim_date_key column.
df_final = df_filter_old.unionByName(df_filter_new)

### SCD TYPE - 1 (UPSERT)

In [0]:
# Preview the combined old + new rows held in df_final.
df_final.display()

dim_date_key,Date_ID
1,DT00029
2,DT00140
3,DT00192
4,DT00444
5,DT00475
6,DT00947
7,DT00976
8,DT01028
9,DT01099
10,DT00657


In [0]:
from delta.tables import DeltaTable

In [0]:
# Check if the table exists
if not spark.catalog.tableExists('cars_catalog.gold.dim_date'):
    # Initial Run: The table does not exist, so we write the data for the first time
    df_final.write.format('delta')\
        .mode('overwrite')\
        .option("path", "abfss://gold@azstrgaccenterprisecars.dfs.core.windows.net/dim_date")\
        .saveAsTable('cars_catalog.gold.dim_date')
else:
    # Incremental Run: The table exists, so we perform a merge operation
    delta_tbl = DeltaTable.forPath(spark, "abfss://gold@azstrgaccenterprisecars.dfs.core.windows.net/dim_date")
    delta_tbl.alias("trg").merge(df_final.alias("src"), "trg.dim_date_key = src.dim_date_key")\
        .whenMatchedUpdateAll()\
        .whenNotMatchedInsertAll()\
        .execute()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNumberFormatException[0m                     Traceback (most recent call last)
File [0;32m<command-3208253329048672>, line 14[0m
[1;32m      8[0m [38;5;28;01melse[39;00m:
[1;32m      9[0m     [38;5;66;03m# Incremental Run: The table exists, so we perform a merge operation[39;00m
[1;32m     10[0m     delta_tbl [38;5;241m=[39m DeltaTable[38;5;241m.[39mforPath(spark, [38;5;124m"[39m[38;5;124mabfss://gold@azstrgaccenterprisecars.dfs.core.windows.net/dim_date[39m[38;5;124m"[39m)
[1;32m     11[0m     delta_tbl[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mtrg[39m[38;5;124m"[39m)[38;5;241m.[39mmerge(df_final[38;5;241m.[39malias([38;5;124m"[39m[38;5;124msrc[39m[38;5;124m"[39m), [38;5;124m"[39m[38;5;124mtrg.dim_date_key = src.dim_date_key[39m[38;5;124m"[39m)\
[1;32m     12[0m         [38;5;241m.[39mwhenMatchedUpdateAll()\
[1;32m     13[0m         [

In [0]:
%sql
-- Show every row of the gold date dimension to inspect the result of the write/merge step.
SELECT * FROM cars_catalog.gold.dim_date

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-3208253329048672>, line 4[0m
[1;32m      1[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfunctions[39;00m [38;5;28;01mimport[39;00m col
[1;32m      3[0m [38;5;66;03m# Ensure dim_date_key is cast correctly in df_final[39;00m
[0;32m----> 4[0m df_final [38;5;241m=[39m df_final[38;5;241m.[39mwithColumn([38;5;124m"[39m[38;5;124mdim_date_key[39m[38;5;124m"[39m, try_cast(col([38;5;124m"[39m[38;5;124mdim_date_key[39m[38;5;124m"[39m)[38;5;241m.[39mcast([38;5;124m"[39m[38;5;124mBIGINT[39m[38;5;124m"[39m)))
[1;32m      6[0m [38;5;66;03m# Check if the table exists[39;00m
[1;32m      7[0m [38;5;28;01mif[39;00m [38;5;129;01mnot[39;00m spark[38;5;241m.[39mcatalog[38;5;24