**IMPORTS**

In [0]:
from pyspark.sql.functions import *

**Importing bronze_raw table**


In [0]:
# Load the source table from the workspace catalog and preview its contents
source_table = "bronze_raw"
bronze_raw = spark.table(source_table)
display(bronze_raw)


**Finding rows where a required column is NULL or blank**

In [0]:
# Collect every row in which at least one required column is NULL or holds
# only whitespace. The per-column checks are OR-ed together into one predicate.
required_columns = ["first_name", "age", "email", "phone_no"]

missing_condition = None
for column_name in required_columns:
    is_missing = col(column_name).isNull() | (trim(col(column_name)) == "")
    if missing_condition is None:
        missing_condition = is_missing
    else:
        missing_condition = missing_condition | is_missing

checking_data = bronze_raw.filter(missing_condition)
checking_data.show()

**Checking on Duplicate Rows**

In [0]:
# Check for fully identical rows by comparing the row count before and after
# de-duplication. A difference of zero means the table has no duplicate rows.
# Note: despite its name, `duplicates` holds the de-duplicated frame.
total_rows = bronze_raw.count()
duplicates = bronze_raw.dropDuplicates()
distinct_rows = duplicates.count()

print(
    f"total rows: {total_rows}, distinct rows: {distinct_rows}, "
    f"duplicate rows: {total_rows - distinct_rows}"
)
display(duplicates)

**Casting the age column from BigInt to int**

In [0]:
# Cast age from bigint to int, since ages comfortably fit in a 32-bit integer.
# withColumn returns a new DataFrame, so the result must be assigned back;
# otherwise the cast is silently discarded.
bronze_raw = bronze_raw.withColumn("age", col("age").cast("int"))

**Removing the rows with age less than 0**

In [0]:
# Keep only rows whose age is zero or greater (negative ages are removed).
# Only the lower bound is enforced here; no upper limit is applied.
non_negative_age = col("age") >= 0
bronze_raw = bronze_raw.where(non_negative_age)
display(bronze_raw)

**Filtering out rows where age is not a number**

In [0]:
# Keep only rows whose age is made up entirely of digits
digits_only_pattern = "^[0-9]+$"
bronze_raw = bronze_raw.where(col("age").rlike(digits_only_pattern))
display(bronze_raw)

**Keeping the First letter capital for both first_name and last_name**

In [0]:
# Normalize the casing of both name columns: the first letter of each word is
# upper-cased and the remaining letters are lower-cased.
for name_column in ("first_name", "last_name"):
    bronze_raw = bronze_raw.withColumn(name_column, initcap(name_column))
display(bronze_raw)

**Trimming and lowercase for email**

In [0]:
# Normalize email: strip surrounding whitespace, then convert to lowercase
normalized_email = lower(trim(col("email")))
bronze_raw = bronze_raw.withColumn("email", normalized_email)
display(bronze_raw)


**checking the format for email**

In [0]:
# Pattern for a plausible email address: local part, "@", domain, and a
# top-level domain of at least two letters. Rows that do not match are dropped.
email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

has_valid_format = col("email").rlike(email_regex)
bronze_raw = bronze_raw.where(has_valid_format)
display(bronze_raw)


**Adding a new column that flags whether the email is valid**

In [0]:
# Add a boolean column email_valid: True when the email matches email_regex,
# False in every other case (including when the match cannot be evaluated).
matches_pattern = col("email").rlike(email_regex)
email_flag = when(matches_pattern, True).otherwise(False)
bronze_raw = bronze_raw.withColumn("email_valid", email_flag)
display(bronze_raw)

In [0]:
# Count the rows that are flagged as having an invalid email
invalid_email_rows = bronze_raw.where(col("email_valid") == False)
invalid_email_rows.count()

**Improving the format of the phone_no field**

In [0]:
# Create phone_no_clean from phone_no by cutting everything from the first
# "x" or "X" onward, which removes extensions such as "x1234".
extension_pattern = "(?i)x.*"
without_extension = regexp_replace(col("phone_no"), extension_pattern, "")
bronze_raw = bronze_raw.withColumn("phone_no_clean", without_extension)
display(bronze_raw)


**Cleaning up the phone_no column**

In [0]:
# Strip formatting characters (whitespace, dashes, dots, parentheses)
# from the cleaned phone number.
separator_pattern = "[\\s\\-\\.\\(\\)]"
without_separators = regexp_replace(col("phone_no_clean"), separator_pattern, "")
bronze_raw = bronze_raw.withColumn("phone_no_clean", without_separators)
display(bronze_raw)


In [0]:
# Drop a leading country code ("+1" or "001") from the cleaned phone number
country_code_pattern = "^(\\+1|001)"
without_country_code = regexp_replace(col("phone_no_clean"), country_code_pattern, "")
bronze_raw = bronze_raw.withColumn("phone_no_clean", without_country_code)
display(bronze_raw)


In [0]:
# Add a boolean column phone_valid that is True when the cleaned phone number
# is exactly 10 digits. The result is assigned back to bronze_raw so that the
# new column is actually present in the frame displayed below.
ten_digit_pattern = "^[0-9]{10}$"
bronze_raw = bronze_raw.withColumn(
    "phone_valid",
    col("phone_no_clean").rlike(ten_digit_pattern),
)
display(bronze_raw)


In [0]:
from pyspark.sql import functions as F

# Keep only rows whose cleaned phone number is present and exactly
# 10 characters long.
cleaned_phone = F.col("phone_no_clean")
has_ten_characters = F.length(cleaned_phone) == 10
filtered_df = bronze_raw.filter(cleaned_phone.isNotNull() & has_ten_characters)
display(filtered_df)

In [0]:
# Remove the original phone_no column from bronze_raw now that a cleaned
# version exists.
# NOTE(review): filtered_df was derived before this drop, so it still carries
# phone_no; confirm whether the staged table is meant to include it.
obsolete_column = "phone_no"
bronze_raw = bronze_raw.drop(obsolete_column)
display(bronze_raw)

In [0]:
# Drop any previous copy of the staging table so the write below can recreate
# it. The name is schema-qualified to match the table written later
# (default.bronze_stage), so the drop hits the same table regardless of the
# session's current database.
spark.sql("DROP TABLE IF EXISTS default.bronze_stage")


**Writing the validated table to bronze_stage**

In [0]:
# Persist the phone-validated rows as the Delta table default.bronze_stage
stage_writer = filtered_df.write.format("delta")
stage_writer.saveAsTable("default.bronze_stage")


In [0]:
%sql
-- Preview the staged table; schema-qualified to match the table that was written
SELECT * FROM default.bronze_stage