## Transform: raw to staging tables

In [247]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [248]:
from pyspark.sql.functions import lit
from delta.tables import DeltaTable
import json

In [249]:
# Read the notebook settings from config.json in the working directory.
with open('config.json', 'r') as f:
    config = json.load(f)

# "general" holds the database/schema names and the user prefix from which
# the raw and staging catalog names are derived.
_general = config["general"]
DATABASE = _general["database"]
SCHEMA = _general["schema"]
RAW_CATALOG = f'{_general["user"]}_raw'
STG_CATALOG = f'{_general["user"]}_stg'

# "tables" maps each raw table name to its per-table settings; the list keeps
# the names in config order for iteration.
tables_dict = config["tables"]
tables_list = list(tables_dict)

In [250]:
# Full-refresh switch. When true (the default, which matches this cell's
# previous unconditional behaviour) the staging schema and every table in it
# are dropped, so the load loop further down always takes its "create" branch
# and its MERGE branch is never reached.
# Set config["general"]["full_refresh"] to false to keep the existing staging
# tables and let the load loop upsert into them instead.
FULL_REFRESH = config["general"].get("full_refresh", True)

if FULL_REFRESH:
    cmd = f"DROP SCHEMA IF EXISTS {STG_CATALOG}.{SCHEMA} CASCADE"
    spark.sql(cmd)

# Always make sure the target schema exists before any table is written.
create_schema = f"CREATE SCHEMA IF NOT EXISTS {STG_CATALOG}.{SCHEMA}"
spark.sql(create_schema)


In [251]:
def read_raw_df(table_name):
    """Return the raw-layer table `table_name` as a Spark DataFrame.

    Switches the session's current catalog to RAW_CATALOG, then looks the
    table up by its three-part name RAW_CATALOG.SCHEMA.table_name.
    """
    spark.sql(f"USE CATALOG {RAW_CATALOG}")
    qualified_name = f"{RAW_CATALOG}.{SCHEMA}.{table_name}"
    return spark.table(qualified_name)

In [252]:
def save_stg_df(stg_df, table_name):
    """Append `stg_df` to the Delta table STG_CATALOG.SCHEMA.<table_name>_stg.

    Switches the session's current catalog to STG_CATALOG and creates the
    schema if it is missing before writing. Returns None.
    """
    schema_path = f"{STG_CATALOG}.{SCHEMA}"

    spark.sql(f"USE CATALOG {STG_CATALOG}")
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_path}")

    delta_writer = stg_df.write.format("delta").mode("append")
    delta_writer.saveAsTable(f"{schema_path}.{table_name}_stg")


In [253]:
def rename_columns(df, columns_map):
    """Project `df` onto the mapped columns and rename them in a single pass.

    Args:
        df: Spark DataFrame that contains every key of `columns_map` as a column.
        columns_map: dict mapping original column name -> new column name.
            The dict's iteration order determines the output column order.

    Returns:
        A new DataFrame holding only the mapped columns, each under its alias.
    """
    # Alias every column inside one select instead of chaining
    # withColumnRenamed: a chained rename matches on the *current* column
    # names, so a map where an alias equals another source column
    # (e.g. {"a": "b", "b": "a"}) would rename already-renamed columns and
    # leave duplicate/wrong names. Aliasing from the original frame renames
    # each column exactly once.
    return df.select(
        [df[original].alias(alias) for original, alias in columns_map.items()]
    )

In [254]:

def insert_origin_columns(df, schema):
    """Tag every row of `df` with constant lineage columns.

    Args:
        df: Spark DataFrame to extend.
        schema: value written to the "source_departament" column.

    Returns:
        A new DataFrame with two extra literal columns:
        "source_name" (always "adventure_works") and "source_departament".
    """
    # Use the `schema` argument rather than the module-level SCHEMA so the
    # parameter is actually honoured. The column name "source_departament"
    # is kept as spelled because it is part of the written table's schema.
    return (
        df.withColumn("source_name", lit("adventure_works"))
          .withColumn("source_departament", lit(schema))
    )

## Load all configured tables into staging

In [255]:
# For every table in the config: read the raw table, project/rename its
# columns, tag it with the origin columns, then either create the staging
# Delta table (first load) or upsert into it on the configured primary key.
for table in tables_list:
    print(table)
    table_config = tables_dict[table]

    raw_df = read_raw_df(table)
    stg_df = rename_columns(raw_df, table_config["stg_columns"])
    stg_df = insert_origin_columns(stg_df, SCHEMA)
    stg_table_name = f"{STG_CATALOG}.{SCHEMA}.{table_config['stg_name']}_stg"

    if not spark.catalog.tableExists(stg_table_name):
        # Create the Delta table if it doesn't exist
        stg_df.write.format("delta").saveAsTable(stg_table_name)
        continue

    # head(1) fetches at most one row, which is enough to tell whether there
    # is anything to merge; count() would scan the whole raw table.
    if not raw_df.head(1):
        print('Nothing to update')
        continue

    primary_key_columns = table_config["primary_key"]
    # Accept a single column given as a plain string; iterating over a string
    # would otherwise build one join predicate per character.
    if isinstance(primary_key_columns, str):
        primary_key_columns = [primary_key_columns]
    if not primary_key_columns:
        raise ValueError(f"No primary_key configured for table {table!r}; cannot build merge condition")

    merge_condition = " AND ".join(
        f"target.{key} = source.{key}" for key in primary_key_columns
    )

    # Perform the merge operation if the table exists: rows matching on the
    # primary key are overwritten, all others are inserted.
    delta_table = DeltaTable.forName(spark, stg_table_name)
    delta_table.alias("target").merge(
        stg_df.alias("source"),
        merge_condition
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

CountryRegionCurrency
create
CreditCard
create
Currency
create
CurrencyRate
create
Customer
create
PersonCreditCard
create
SalesOrderDetail
create
SalesOrderHeader
create
SalesOrderHeaderSalesReason
create
SalesPerson
create
SalesPersonQuotaHistory
create
SalesReason
create
SalesTaxRate
create
SalesTerritory
create
SalesTerritoryHistory
create
ShoppingCartItem
create
SpecialOffer
create
SpecialOfferProduct
create
Store
create
