- Load the `crm_cust_info` table from the bronze layer

In [0]:
# Read the bronze-layer CRM customer table into a Spark DataFrame.
df_crm_cust_info = spark.table("workspace.bronze.crm_cust_info")

In [0]:
from pyspark.sql.functions import col,isnan, when, count
# Count the NULL entries of every column: `when` produces a non-null marker
# only for NULL cells, and `count` tallies the non-null markers.
null_count_exprs = []
for column_name in df_crm_cust_info.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

df_crm_cust_info.select(null_count_exprs).show()


- check the dataset

In [0]:
# Render the DataFrame for visual inspection.
# NOTE(review): `.display()` is supplied by the notebook runtime (presumably
# Databricks), not by core PySpark — confirm before running elsewhere.
df_crm_cust_info.display()

- check the schema

In [0]:
# Print the schema (column names, data types, nullability) as a tree.
df_crm_cust_info.printSchema()

In [0]:
# List the current column names (the source names that get renamed below).
df_crm_cust_info.columns

- Rename Column Names

In [0]:
# Source-system column name -> readable column name used from here on.
# Insertion order follows the order of the columns in the source table.
mapping_col_names = dict(
    cst_id="customer_id",
    cst_key="customer_key",
    cst_firstname="first_name",
    cst_lastname="last_name",
    cst_marital_status="marital_status",
    cst_gndr="gender",
    cst_create_date="create_date",
)

In [0]:
# Helper that renames a single column of a DataFrame.

def rename_column(dataframe, old_column: str, new_column: str):
    """Return ``dataframe`` with ``old_column`` renamed to ``new_column``.

    Thin wrapper that delegates to ``dataframe.withColumnRenamed``.

    Args:
        dataframe: Object exposing ``withColumnRenamed`` (a Spark DataFrame
            in this notebook).
        old_column: Name of the column to rename.
        new_column: Name the column should have afterwards.

    Returns:
        The value returned by ``dataframe.withColumnRenamed``; the caller is
        responsible for keeping a reference to it.
    """
    renamed = dataframe.withColumnRenamed(old_column, new_column)
    return renamed

In [0]:
# Apply every old -> new pair from the mapping, one rename per iteration.
# The DataFrame variable is rebound each time so the renames accumulate.
for source_name in mapping_col_names:
    target_name = mapping_col_names[source_name]
    df_crm_cust_info = rename_column(df_crm_cust_info, source_name, target_name)
    

In [0]:
# Confirm the rename took effect: list the current column names.
df_crm_cust_info.columns

- find duplicates

In [0]:
# Preview the first 10 rows as plain text.
df_crm_cust_info.show(10)

In [0]:
from pyspark.sql.functions import col

# Count rows per customer_id, then keep only the ids that occur more than once.
rows_per_customer = df_crm_cust_info.groupBy("customer_id").count()
df_duplicated_id = rows_per_customer.where(col("count") > 1)

df_duplicated_id.show()

In [0]:
# Discard every row whose customer_id is null.
df_crm_cust_info = df_crm_cust_info.dropna(subset=["customer_id"])

In [0]:
# Show every row whose customer_id occurs more than once, so we can decide
# which record of each duplicate group to keep.
#
# The ids are taken from `df_duplicated_id` (computed in the duplicate check
# above) instead of a hand-copied list, so this cell stays correct when the
# bronze table changes. A left-semi join keeps only the customer rows that
# have a match and adds no columns; a null customer_id never matches.
# `truncate` is an option of `show()`, so it is not passed to `display()`.
duplicated_rows = df_crm_cust_info.join(
    df_duplicated_id.select("customer_id"),
    on="customer_id",
    how="left_semi",
).orderBy("customer_id")  # keep the rows of each duplicate group together

duplicated_rows.display()

In [0]:
# keep only rows with no nulls in the first_name, last_name and gender columns
 
# Build a single "value is present" condition covering first_name, last_name
# and gender, then keep only the rows that satisfy it.
# NOTE(review): this removes every row lacking one of these values, not only
# rows belonging to duplicated ids — confirm that is intended.
required_columns = ["first_name", "last_name", "gender"]
all_present = col(required_columns[0]).isNotNull()
for required_column in required_columns[1:]:
    all_present = all_present & col(required_column).isNotNull()

df_crm_cust_info = df_crm_cust_info.where(all_present)

In [0]:
# Re-run the duplicate check on the current DataFrame: any customer_id listed
# here still appears on more than one row.
per_id_counts = df_crm_cust_info.groupBy("customer_id").count()
is_repeated = col("count") > 1
df_duplicated_id = per_id_counts.filter(is_repeated)

df_duplicated_id.show()

In [0]:
# Render the current state of the DataFrame for visual inspection.
df_crm_cust_info.display()

- Validate string values: check for extra spaces, identify abbreviations to normalize

In [0]:
# remove extra spaces using trim function
from pyspark.sql.functions import trim, col

# Strip leading/trailing whitespace from every string column.
#
# Done as one `select` projection instead of a `withColumn` call per column:
# each `withColumn` adds another projection to the query plan, and the
# original loop also re-evaluated `.dtypes` by index on every iteration.
# Column order and names are unchanged; non-string columns pass through as-is.
string_columns = {
    name for name, dtype in df_crm_cust_info.dtypes if dtype == "string"
}

df_crm_cust_info = df_crm_cust_info.select(
    [
        trim(col(name)).alias(name) if name in string_columns else col(name)
        for name in df_crm_cust_info.columns
    ]
)


In [0]:
# Render the DataFrame to check the string columns visually.
df_crm_cust_info.display()

- Validate the `gender` and `marital_status` columns

In [0]:
# Row count per distinct gender value, to spot inconsistent codes.
gender_counts = df_crm_cust_info.groupBy("gender").count()
gender_counts.show()

In [0]:
# Row count per distinct marital_status value, to spot inconsistent codes.
marital_status_counts = df_crm_cust_info.groupBy("marital_status").count()
marital_status_counts.show()

- Validate date values: check the data type, check the format, handle missing values

In [0]:
# Validate the data type: print the schema of the `create_date` column only.
df_crm_cust_info.select('create_date').printSchema()

In [0]:
# Check the format without printing thousands of rows into the cell output:
# the min/max expose out-of-range or mis-parsed values, and a small
# untruncated sample shows how individual values are rendered.
# `functions` is imported under an alias so that `min`/`max` do not shadow the
# Python built-ins in the notebook namespace.
from pyspark.sql import functions as F

df_crm_cust_info.agg(
    F.min("create_date").alias("min_create_date"),
    F.max("create_date").alias("max_create_date"),
).show(truncate=False)

df_crm_cust_info.select("create_date").show(20, truncate=False)

In [0]:
from pyspark.sql.functions import col,isnan, when, count
# Re-count the NULL entries per column on the current DataFrame.
# `when` emits a non-null marker only for NULL cells; `count` tallies them.
null_counts = df_crm_cust_info.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df_crm_cust_info.columns
    )
)
null_counts.show()


In [0]:
# Render the DataFrame for a final visual check.
df_crm_cust_info.display()