- Read the `bronze.crm_prd_info` table

In [0]:
# Load the bronze product table into a DataFrame; later cells reassign this same variable.
df_crm_prd_info = spark.table("bronze.crm_prd_info")

In [0]:
# Preview the raw bronze rows before any transformation.
df_crm_prd_info.display()

- Check The Schema

In [0]:
# Print the column names and data types of the loaded table.
df_crm_prd_info.printSchema()

- Rename all columns from the abbreviated `prd_*` names to descriptive `product_*` names

In [0]:
# Lookup of old column name -> new column name, consumed by the rename loop.
mapping_col_names = dict(
    prd_id='product_id',
    prd_key='product_key',
    prd_nm='product_name',
    prd_cost='product_cost',
    prd_line='product_line',
    prd_start_dt='product_start_date',
    prd_end_dt='product_end_date',
)

In [0]:
# create a function that renames the column name

def rename_column(dataframe, old_column, new_column):
    """Rename a single column of ``dataframe``.

    Delegates to ``dataframe.withColumnRenamed(old_column, new_column)`` and
    hands back whatever that call returns.

    Args:
        dataframe: Object exposing ``withColumnRenamed``.
        old_column: Current name of the column.
        new_column: Name the column should carry afterwards.

    Returns:
        The result of ``withColumnRenamed``.
    """
    renamed = dataframe.withColumnRenamed(old_column, new_column)
    return renamed

In [0]:
# update the column names

# Apply each old -> new rename from mapping_col_names, reassigning the DataFrame every time.
for source_name, target_name in mapping_col_names.items():
    df_crm_prd_info = rename_column(df_crm_prd_info, source_name, target_name)

In [0]:
# Confirm that the columns now carry the new names.
print(df_crm_prd_info.columns)

In [0]:
# Preview the data under the renamed columns.
df_crm_prd_info.display()

- Find Duplicates

In [0]:
# NOTE(review): same preview as the previous cell; nothing was transformed in between,
# so this cell looks redundant.
df_crm_prd_info.display()

In [0]:
from pyspark.sql.functions import col

# function which checks for duplicates

def check_duplicates(dataframe, column):
    """Print the values of ``column`` that occur more than once.

    Groups ``dataframe`` by ``column``, counts the rows per group and keeps
    only the groups whose count is greater than 1.

    Args:
        dataframe: DataFrame to inspect.
        column: Column to group by.

    Returns:
        The result of ``.show()``; the duplicate groups are printed rather
        than handed back to the caller.
    """
    occurrences = dataframe.groupBy(column).count()
    repeated = occurrences.filter(col("count") > 1)
    return repeated.show()

In [0]:
# List the product_key values that appear on more than one row.
check_duplicates(dataframe=df_crm_prd_info, column='product_key')

In [0]:
%sql
-- Inspect the bronze rows for two specific product keys.
-- NOTE(review): presumably these are keys reported by the duplicate check -- confirm.
SELECT *
FROM bronze.crm_prd_info
WHERE prd_key IN ('CO-RF-FR-R92R-44', 'AC-HE-HL-U509-R')


- Trim Extra Spaces

In [0]:
# remove extra spaces using trim function
from pyspark.sql.functions import trim, col

# Trim every string column in one projection. Non-string columns pass through
# unchanged and the column order is preserved. A single select() is used rather
# than withColumn() in a loop, because each withColumn() call adds another
# projection to the query plan.
df_crm_prd_info = df_crm_prd_info.select(
    [
        trim(col(name)).alias(name) if dtype == 'string' else col(name)
        for name, dtype in df_crm_prd_info.dtypes
    ]
)

In [0]:
# Preview the data after trimming the string columns.
df_crm_prd_info.display()

- Fix product_key column, keep only the first 2 chars

In [0]:
from pyspark.sql.functions import col, substring

# Overwrite product_key with its first two characters (start position 1, length 2).
# NOTE(review): this is lossy -- a key such as 'CO-RF-FR-R92R-44' becomes 'CO', so the
# column can no longer identify a single product. Confirm that nothing downstream
# needs the full key.
product_key_prefix = substring(col("product_key"), 1, 2)
df_crm_prd_info = df_crm_prd_info.withColumn('product_key', product_key_prefix)

- Validate date values: check the data type, check the format, and handle missing values

In [0]:
# Show the data types of the two date columns only.
df_crm_prd_info.select("product_start_date", "product_end_date").printSchema()

In [0]:
# Print up to 50 rows of the date columns to inspect their format and spot missing values.
df_crm_prd_info.select("product_start_date", "product_end_date").show(50)

- handle missing values

In [0]:
from pyspark.sql.functions import col,isnan, when, count
# One aggregate per column: when() yields a non-null value only for rows where the
# column is NULL, and count() tallies those rows, giving the NULL count per column.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_crm_prd_info.columns
]
df_crm_prd_info.select(null_count_exprs).show()


* Handle null values in the `product_cost` column

In [0]:
# Print a sample of the product_cost values.
df_crm_prd_info.select('product_cost').show()

In [0]:
%sql
-- List the product keys whose cost is NULL in the bronze table.
SELECT prd_key
FROM bronze.crm_prd_info
WHERE prd_cost IS NULL

-- Decision: the NULL costs are kept as-is in the silver stage.

- Handle missing values on the product_line column

In [0]:
# Row count per product_line value, before the nulls are filled.
df_crm_prd_info.groupBy("product_line").count().show()

In [0]:
# Replace NULLs in product_line with the placeholder 'Unknown'; other columns are untouched.
df_crm_prd_info = df_crm_prd_info.fillna({"product_line": 'Unknown'})

In [0]:
# Re-check the counts per product_line after the fill.
df_crm_prd_info.groupBy("product_line").count().show()

- Export the final dataset as the `silver.crm_prd_info` table

In [0]:
# Persist the cleaned DataFrame as the Delta table silver.crm_prd_info,
# using "overwrite" mode.
(
    df_crm_prd_info.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.crm_prd_info")
)