# Transform: sales schema tables (raw to staging)

In [55]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F


# Schema name shared by the raw and staging catalogs.
SCHEMA = 'sales'
# Catalog the source tables are read from (see read_raw_df).
RAW_CATALOG = 'antonio_junior_raw'
# Catalog the renamed "<table>_stg" tables are written to (see save_stg_df).
STG_CATALOG = "antonio_junior_stg"

In [57]:
def read_raw_df(table_name):
    """Return the raw-layer table ``table_name`` as a DataFrame.

    Switches the session's current catalog to ``RAW_CATALOG`` and then looks
    the table up by its fully qualified name
    ``{RAW_CATALOG}.{SCHEMA}.{table_name}``.
    """
    spark.sql(f"USE CATALOG {RAW_CATALOG}")
    qualified_name = f"{RAW_CATALOG}.{SCHEMA}.{table_name}"
    return spark.table(qualified_name)

In [84]:
def save_stg_df(stg_df, table_name):
    """Persist ``stg_df`` as the Delta table ``{STG_CATALOG}.{SCHEMA}.{table_name}_stg``.

    Switches the session's current catalog to ``STG_CATALOG``, creates the
    target schema if it does not exist yet, and writes the DataFrame in
    ``overwrite`` mode.
    """
    target_schema = f"{STG_CATALOG}.{SCHEMA}"
    target_table = f"{target_schema}.{table_name}_stg"

    spark.sql(f"USE CATALOG {STG_CATALOG}")
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_schema}")

    delta_writer = stg_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(target_table)


In [73]:
def rename_columns(df, columns_map):
    """Keep only the mapped columns of ``df`` and rename them in one projection.

    Args:
        df: Source DataFrame; it must contain every key of ``columns_map``.
        columns_map: Mapping of ``original column name -> new column name``.
            Its iteration order defines the output column order.

    Returns:
        A new DataFrame holding exactly the mapped columns under their new
        names. Columns of ``df`` that are not keys of ``columns_map`` are
        dropped.
    """
    # Alias every column against the *input* frame in a single select rather
    # than chaining withColumnRenamed calls: chained renames see the names
    # produced by earlier renames, so a map such as {"a": "b", "b": "a"}
    # would collapse both columns onto the same name.
    return df.select(
        [df[original].alias(alias) for original, alias in columns_map.items()]
    )

## Table 1: CountryRegionCurrency

In [85]:
# Stage CountryRegionCurrency: keep the mapped columns under snake_case names.
columns_map = {
    "CountryRegionCode": "country_region_code",
    "CurrencyCode": "currency_code",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("CountryRegionCurrency")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "country_region_currency")

root
 |-- CountryRegionCode: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 2: CreditCard

In [86]:
# Stage CreditCard: keep the mapped columns under snake_case names.
columns_map = {
    "CreditCardID": "credit_card_id",
    "CardType": "card_type",
    "CardNumber": "card_number",
    "ExpMonth": "exp_month",
    "ExpYear": "exp_year",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("CreditCard")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "credit_card")

root
 |-- CreditCardID: integer (nullable = true)
 |-- CardType: string (nullable = true)
 |-- CardNumber: string (nullable = true)
 |-- ExpMonth: short (nullable = true)
 |-- ExpYear: short (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 3: Currency

In [87]:
# Stage Currency: keep the mapped columns under snake_case names.
columns_map = {
    "CurrencyCode": "currency_code",
    "Name": "name",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("Currency")

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "currency")

## Table: CurrencyRate

In [88]:
# Stage CurrencyRate: keep the mapped columns under snake_case names.
columns_map = {
    "CurrencyRateID": "currency_rate_id",
    "CurrencyRateDate": "currency_rate_date",
    "FromCurrencyCode": "from_currency_code",
    "ToCurrencyCode": "to_currency_code",
    "AverageRate": "average_rate",
    "EndOfDayRate": "end_of_day_rate",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("CurrencyRate")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "currency_rate")

root
 |-- CurrencyRateID: integer (nullable = true)
 |-- CurrencyRateDate: timestamp (nullable = true)
 |-- FromCurrencyCode: string (nullable = true)
 |-- ToCurrencyCode: string (nullable = true)
 |-- AverageRate: decimal(19,4) (nullable = true)
 |-- EndOfDayRate: decimal(19,4) (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 4: Customer

In [89]:
# Stage Customer: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "CustomerID": "customer_id",
    "PersonID": "person_id",
    "StoreID": "store_id",
    "TerritoryID": "territory_id",
    "AccountNumber": "account_number",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("Customer")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "customer")

root
 |-- CustomerID: integer (nullable = true)
 |-- PersonID: integer (nullable = true)
 |-- StoreID: integer (nullable = true)
 |-- TerritoryID: integer (nullable = true)
 |-- AccountNumber: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 5: PersonCreditCard

In [90]:
# Stage PersonCreditCard: keep the mapped columns under snake_case names.
columns_map = {
    "BusinessEntityID": "business_entity_id",
    "CreditCardID": "credit_card_id",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("PersonCreditCard")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "person_credit_card")

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- CreditCardID: integer (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 6: SalesOrderDetail

In [91]:
# Stage SalesOrderDetail: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "SalesOrderID": "sales_order_id",
    "SalesOrderDetailID": "sales_order_detail_id",
    "CarrierTrackingNumber": "carrier_tracking_number",
    "OrderQty": "order_qty",
    "ProductID": "product_id",
    "SpecialOfferID": "special_offer_id",
    "UnitPrice": "unit_price",
    "UnitPriceDiscount": "unit_price_discount",
    "LineTotal": "line_total",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesOrderDetail")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_order_detail")

root
 |-- SalesOrderID: integer (nullable = true)
 |-- SalesOrderDetailID: integer (nullable = true)
 |-- CarrierTrackingNumber: string (nullable = true)
 |-- OrderQty: short (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- SpecialOfferID: integer (nullable = true)
 |-- UnitPrice: decimal(19,4) (nullable = true)
 |-- UnitPriceDiscount: decimal(19,4) (nullable = true)
 |-- LineTotal: decimal(38,6) (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 7: SalesOrderHeader

In [92]:
# Stage SalesOrderHeader: keep the mapped columns under snake_case names.
columns_map = {
    # Order identity and lifecycle
    "SalesOrderID": "sales_order_id",
    "RevisionNumber": "revision_number",
    "OrderDate": "order_date",
    "DueDate": "due_date",
    "ShipDate": "ship_date",
    "Status": "status",
    "OnlineOrderFlag": "online_order_flag",
    "SalesOrderNumber": "sales_order_number",
    "PurchaseOrderNumber": "purchase_order_number",
    "AccountNumber": "account_number",
    # References to other tables
    "CustomerID": "customer_id",
    "SalesPersonID": "sales_person_id",
    "TerritoryID": "territory_id",
    "BillToAddressID": "bill_to_address_id",
    "ShipToAddressID": "ship_to_address_id",
    "ShipMethodID": "ship_method_id",
    "CreditCardID": "credit_card_id",
    "CreditCardApprovalCode": "credit_card_approval_code",
    "CurrencyRateID": "currency_rate_id",
    # Monetary amounts
    "SubTotal": "sub_total",
    "TaxAmt": "tax_amt",
    "Freight": "freight",
    "TotalDue": "total_due",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesOrderHeader")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_order_header")

root
 |-- SalesOrderID: integer (nullable = true)
 |-- RevisionNumber: short (nullable = true)
 |-- OrderDate: timestamp (nullable = true)
 |-- DueDate: timestamp (nullable = true)
 |-- ShipDate: timestamp (nullable = true)
 |-- Status: short (nullable = true)
 |-- OnlineOrderFlag: boolean (nullable = true)
 |-- SalesOrderNumber: string (nullable = true)
 |-- PurchaseOrderNumber: string (nullable = true)
 |-- AccountNumber: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- SalesPersonID: integer (nullable = true)
 |-- TerritoryID: integer (nullable = true)
 |-- BillToAddressID: integer (nullable = true)
 |-- ShipToAddressID: integer (nullable = true)
 |-- ShipMethodID: integer (nullable = true)
 |-- CreditCardID: integer (nullable = true)
 |-- CreditCardApprovalCode: string (nullable = true)
 |-- CurrencyRateID: integer (nullable = true)
 |-- SubTotal: decimal(19,4) (nullable = true)
 |-- TaxAmt: decimal(19,4) (nullable = true)
 |-- Freight: decimal(19,4) (nullab

## Table 8: SalesOrderHeaderSalesReason

In [93]:
# Stage SalesOrderHeaderSalesReason: keep the mapped columns under snake_case names.
raw_df = read_raw_df("SalesOrderHeaderSalesReason")
raw_df.printSchema()
# The key is spelled "SalesReasonID" exactly as in the raw schema. The previous
# "SalesReasonId" spelling only resolved while Spark's column resolution is
# case-insensitive (spark.sql.caseSensitive = false).
columns_map = {"SalesOrderID": "sales_order_id",
               "SalesReasonID": "sales_reason_id",
               "ModifiedDate": "modified_date"}
stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, 'sales_order_header_sales_reason')

root
 |-- SalesOrderID: integer (nullable = true)
 |-- SalesReasonID: integer (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 9: SalesPerson

In [95]:
# Stage SalesPerson: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "BusinessEntityID": "business_entity_id",
    "TerritoryID": "territory_id",
    "SalesQuota": "sales_quota",
    "Bonus": "bonus",
    "CommissionPct": "commission_pct",
    "SalesYTD": "sales_ytd",
    "SalesLastYear": "sales_last_year",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesPerson")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_person")

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- TerritoryID: integer (nullable = true)
 |-- SalesQuota: decimal(19,4) (nullable = true)
 |-- Bonus: decimal(19,4) (nullable = true)
 |-- CommissionPct: decimal(10,4) (nullable = true)
 |-- SalesYTD: decimal(19,4) (nullable = true)
 |-- SalesLastYear: decimal(19,4) (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 10: SalesPersonQuotaHistory

In [97]:
# Stage SalesPersonQuotaHistory: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "BusinessEntityID": "business_entity_id",
    "QuotaDate": "quota_date",
    "SalesQuota": "sales_quota",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesPersonQuotaHistory")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_person_quota_history")

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- QuotaDate: timestamp (nullable = true)
 |-- SalesQuota: decimal(19,4) (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 11: SalesReason

In [98]:
# Stage SalesReason: keep the mapped columns under snake_case names.
columns_map = {
    "SalesReasonID": "sales_reason_id",
    "Name": "name",
    "ReasonType": "reason_type",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesReason")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_reason")

root
 |-- SalesReasonID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- ReasonType: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 12: SalesTaxRate

In [99]:
# Stage SalesTaxRate: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "SalesTaxRateID": "sales_tax_rate_id",
    "StateProvinceID": "state_province_id",
    "TaxType": "tax_type",
    "TaxRate": "tax_rate",
    "Name": "name",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesTaxRate")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_tax_rate")

root
 |-- SalesTaxRateID: integer (nullable = true)
 |-- StateProvinceID: integer (nullable = true)
 |-- TaxType: short (nullable = true)
 |-- TaxRate: decimal(10,4) (nullable = true)
 |-- Name: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 13: SalesTerritory

In [100]:
# Stage SalesTerritory: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "TerritoryID": "territory_id",
    "Name": "name",
    "CountryRegionCode": "country_region_code",
    "Group": "group",
    "SalesYTD": "sales_ytd",
    "SalesLastYear": "sales_last_year",
    "CostYTD": "cost_ytd",
    "CostLastYear": "cost_last_year",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesTerritory")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_territory")

root
 |-- TerritoryID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- CountryRegionCode: string (nullable = true)
 |-- Group: string (nullable = true)
 |-- SalesYTD: decimal(19,4) (nullable = true)
 |-- SalesLastYear: decimal(19,4) (nullable = true)
 |-- CostYTD: decimal(19,4) (nullable = true)
 |-- CostLastYear: decimal(19,4) (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 14: SalesTerritoryHistory

In [101]:
# Stage SalesTerritoryHistory: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "BusinessEntityID": "business_entity_id",
    "TerritoryID": "territory_id",
    "StartDate": "start_date",
    "EndDate": "end_date",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SalesTerritoryHistory")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "sales_territory_history")

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- TerritoryID: integer (nullable = true)
 |-- StartDate: timestamp (nullable = true)
 |-- EndDate: timestamp (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 15: ShoppingCartItem

In [103]:
# Stage ShoppingCartItem: keep the mapped columns under snake_case names.
columns_map = {
    "ShoppingCartItemID": "shopping_cart_item_id",
    "ShoppingCartID": "shopping_cart_id",
    "Quantity": "quantity",
    "ProductID": "product_id",
    "DateCreated": "date_created",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("ShoppingCartItem")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "shopping_cart_item")

root
 |-- ShoppingCartItemID: integer (nullable = true)
 |-- ShoppingCartID: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- DateCreated: timestamp (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 16: SpecialOffer

In [104]:
# Stage SpecialOffer: keep the mapped columns under snake_case names.
# NOTE(review): besides rowguid, the raw schema also lists StartDate, EndDate,
# MinQty and MaxQty; none of them is in the map, so they are not staged.
# Confirm that dropping these four columns is intentional.
columns_map = {
    "SpecialOfferID": "special_offer_id",
    "Description": "description",
    "DiscountPct": "discount_pct",
    "Type": "type",
    "Category": "category",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SpecialOffer")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "special_offer")

root
 |-- SpecialOfferID: integer (nullable = true)
 |-- Description: string (nullable = true)
 |-- DiscountPct: decimal(10,4) (nullable = true)
 |-- Type: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- StartDate: timestamp (nullable = true)
 |-- EndDate: timestamp (nullable = true)
 |-- MinQty: integer (nullable = true)
 |-- MaxQty: integer (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 17: SpecialOfferProduct

In [105]:
# Stage SpecialOfferProduct: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "SpecialOfferID": "special_offer_id",
    "ProductID": "product_id",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("SpecialOfferProduct")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "special_offer_product")

root
 |-- SpecialOfferID: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



## Table 18: Store

In [106]:
# Stage Store: keep the mapped columns under snake_case names.
# rowguid from the raw schema is not in the map, so it is not staged.
columns_map = {
    "BusinessEntityID": "business_entity_id",
    "Name": "name",
    "SalesPersonID": "sales_person_id",
    "Demographics": "demographics",
    "ModifiedDate": "modified_date",
}

raw_df = read_raw_df("Store")
raw_df.printSchema()

stg_df = rename_columns(raw_df, columns_map)
save_stg_df(stg_df, "store")

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- SalesPersonID: integer (nullable = true)
 |-- Demographics: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)

