In [0]:
# Read the bronze.erp_cust_az12 table into the DataFrame that the rest of
# the notebook transforms.
df_erp_customers = spark.table("bronze.erp_cust_az12")

In [0]:
# Preview the rows as loaded, before any renaming or cleaning.
df_erp_customers.display()

- Check the Schema

In [0]:
# Show the column names and data types of the loaded table.
df_erp_customers.printSchema()

- Rename Column Names

In [0]:
# Source column name -> target column name used from here on in the notebook.
mapping_column_names = {
    'CID': 'customer_id',
    'BDATE': 'birth_date',
    'GEN': 'gender',
}

In [0]:
# helper that renames a single column of a DataFrame

def rename_column(dataframe, old_column, new_column):
    """Return ``dataframe`` with ``old_column`` renamed to ``new_column``.

    Args:
        dataframe: Object exposing ``withColumnRenamed`` (a Spark DataFrame
            in this notebook).
        old_column: Current name of the column.
        new_column: Name the column should have afterwards.

    Returns:
        Whatever ``dataframe.withColumnRenamed(old_column, new_column)``
        returns; the input object is not reassigned here.
    """
    renamed = dataframe.withColumnRenamed(old_column, new_column)
    return renamed

In [0]:
# apply every rename from the mapping, one column at a time

for source_name, target_name in mapping_column_names.items():
    # Rebind the variable each pass so the next rename builds on the previous one.
    df_erp_customers = rename_column(df_erp_customers, source_name, target_name)

In [0]:
# List the column names to confirm the renames took effect.
print(df_erp_customers.columns)

In [0]:
# Preview the data under the new column names.
df_erp_customers.display()

- Find duplicates in the customer_id column

In [0]:
from pyspark.sql.functions import col

# Count rows per customer_id, then show only the ids that occur more than once.
# An empty result means customer_id has no duplicates.
customer_id_counts = df_erp_customers.groupBy("customer_id").count()
customer_id_counts.filter(col("count") > 1).display()

- Trim columns

In [0]:
# remove extra spaces using trim function
from pyspark.sql.functions import trim, col

# Trim every string-typed column in a single select() instead of calling
# withColumn() once per column: each withColumn() adds another projection to
# the query plan, so looping over it makes the plan grow with the column count.
# Iterating over the (name, dtype) pairs from .dtypes also avoids indexing
# .dtypes by position on every pass. Column order and names are unchanged;
# non-string columns are passed through untouched.
df_erp_customers = df_erp_customers.select(
    [
        trim(col(column)).alias(column) if dtype == 'string' else col(column)
        for column, dtype in df_erp_customers.dtypes
    ]
)

In [0]:
# Preview the data after the string columns were trimmed.
df_erp_customers.display()

- Check for Nulls

In [0]:
from pyspark.sql.functions import col, isnan, when, count

# One aggregate per column: when() yields a value only for NULL cells and
# count() skips NULLs, so each output cell is that column's NULL count.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_erp_customers.columns
]
df_erp_customers.select(null_count_exprs).display()


In [0]:
%sql
-- Inspect the rows of the bronze source table whose GEN value is NULL.
-- This reads bronze.erp_cust_az12 directly, not the DataFrame transformed above.
SELECT *
FROM bronze.erp_cust_az12
WHERE GEN IS NULL;

- Map null values in the gender column to "unknown"

In [0]:
from pyspark.sql.functions import col, when

# Replace NULL gender values with the literal "unknown"; all other values are kept.
# NOTE(review): only NULLs are mapped. Blank strings (e.g. whitespace-only values
# left empty by the trim step) would pass through unchanged — confirm whether
# they should also become "unknown".
gender_is_null = col("gender").isNull()
gender_filled = when(gender_is_null, "unknown").otherwise(col("gender"))
df_erp_customers = df_erp_customers.withColumn('gender', gender_filled)

In [0]:
from pyspark.sql.functions import col, isnan, when, count

# Recount NULLs per column now that NULL gender values have been replaced.
# when() yields a value only for NULL cells and count() skips NULLs, so each
# output cell is the number of NULLs remaining in that column.
remaining_null_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_erp_customers.columns
]
df_erp_customers.select(remaining_null_exprs).display()


In [0]:
# Final preview of the cleaned data before it is written out.
df_erp_customers.display()

- Write the result to the silver table

In [0]:

# Save the cleaned DataFrame as a Delta table, overwriting the table's existing data.
# NOTE(review): the target is named erp_customer_az12 while the source table is
# erp_cust_az12 — confirm the naming difference is intended.
(
    df_erp_customers.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.erp_customer_az12")
)