In [0]:
# Read the bronze table into a Spark DataFrame; every later cell reassigns this name.
df_erp_px_cat = spark.table("bronze.erp_px_cat_g1v2")

In [0]:
# Preview the table exactly as loaded, before any transformation.
df_erp_px_cat.display()

- Check the Schema

In [0]:
# Print the column names and data types of the loaded table.
df_erp_px_cat.printSchema()

- Rename Column Names

In [0]:
# Source column name -> target column name, consumed by the rename loop.
mapping_column_names = dict(
    ID='product_key',
    CAT='category',
    SUBCAT='subcategory',
    MAINTENANCE='maintenance',
)

In [0]:
def rename_column(dataframe, old_column, new_column):
    """Rename a single column of ``dataframe``.

    Delegates to ``dataframe.withColumnRenamed(old_column, new_column)``
    and hands its result back to the caller.

    Args:
        dataframe: Object exposing ``withColumnRenamed`` (a Spark DataFrame
            in this notebook).
        old_column: Current name of the column.
        new_column: Name the column should have afterwards.

    Returns:
        Whatever ``withColumnRenamed`` returns, i.e. the renamed frame.
    """
    renamed = dataframe.withColumnRenamed(old_column, new_column)
    return renamed

In [0]:
# Apply every source -> target rename from the mapping, one column per pass.

for source_name, target_name in mapping_column_names.items():
    df_erp_px_cat = rename_column(df_erp_px_cat, source_name, target_name)

In [0]:
# Preview the frame after the column renames.
df_erp_px_cat.display()

- Fix the product_key column: keep only the first 2 characters so it can be joined with the other product_key column

In [0]:
from pyspark.sql.functions import col, substring, trim

# Keep only the first 2 characters of product_key (substring is 1-based).
# The value is trimmed *before* it is cut: whitespace is only stripped in a
# later cell, so truncating the raw value would turn e.g. " AB_CD" into " A"
# (and then "A") instead of "AB".
# NOTE(review): a 2-character key may no longer be unique per row — confirm
# this matches the key on the other side of the intended join.
df_erp_px_cat = df_erp_px_cat.withColumn(
    'product_key',
    substring(trim(col("product_key")), 1, 2)
)

In [0]:
# Preview the frame after shortening product_key.
df_erp_px_cat.display()

- Trim Columns

In [0]:
# remove extra spaces using trim function
from pyspark.sql.functions import trim, col

# Trim every string column in a single projection. `dtypes` yields
# (column name, type name) pairs, so no positional indexing is needed, and
# one `select` avoids stacking a new projection per column the way repeated
# `withColumn` calls do. Non-string columns pass through unchanged and the
# column order is preserved.
df_erp_px_cat = df_erp_px_cat.select(
    [
        trim(col(name)).alias(name) if dtype == 'string' else col(name)
        for name, dtype in df_erp_px_cat.dtypes
    ]
)

In [0]:
# Preview the frame after trimming the string columns.
df_erp_px_cat.display()

- Check values

In [0]:
# Show the distinct values of each column together with their row counts.
# The loop variable is deliberately not called `col`: that name is the
# imported pyspark.sql.functions.col, and rebinding it to a string here would
# break any later `col(...)` call that runs without re-importing it.
for column_name in df_erp_px_cat.columns:
    df_erp_px_cat.groupBy(column_name).count().show()

In [0]:
# Preview the frame again; the value checks above only read it and do not reassign it.
df_erp_px_cat.display()

- Check for Nulls

In [0]:
from pyspark.sql.functions import col, count, isnan, when

# One aggregate per column: `when` yields a non-null value only for rows where
# the column is null, and `count` counts non-null values, so each output cell
# is the number of nulls in that column.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_erp_px_cat.columns
]
df_erp_px_cat.select(null_count_exprs).display()


- Export as silver table

In [0]:

# Save the cleaned frame as a Delta table in the silver schema; "overwrite"
# mode replaces any existing contents of that table.
(
    df_erp_px_cat.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.erp_px_cat_g1v2")
)