In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, split, concat, regexp_replace, when, monotonically_increasing_id


In [9]:
# Build (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("DataQualityLayer")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

In [51]:
# Explicit schema for the merged raw CSV, one (column name, Spark type, nullable) entry
# per column. The entry order is presumed to match the column order of the file — TODO confirm.
raw_dataSchema = StructType([
    StructField(field_name, field_type, nullable=is_nullable)
    for field_name, field_type, is_nullable in [
        ("transaction_date", DateType(), False),
        ("transaction_id", StringType(), False),
        ("customer_id", LongType(), False),
        ("customer_fname", StringType(), False),
        ("customer_lname", StringType(), False),
        ("cusomter_email", StringType(), False),  # spelling kept: later cells use this name
        ("sales_agent_id", StringType(), True),
        ("branch_id", StringType(), True),
        ("product_id", IntegerType(), False),
        ("product_name", StringType(), False),
        ("product_category", StringType(), False),
        ("offer_1", StringType(), True),
        ("offer_2", StringType(), True),
        ("offer_3", StringType(), True),
        ("offer_4", StringType(), True),
        ("offer_5", StringType(), True),
        ("units", IntegerType(), False),
        ("unit_price", DoubleType(), False),
        ("is_online", StringType(), False),
        ("payment_method", StringType(), False),
        ("shipping_address", StringType(), True),
        ("name", StringType(), True),
        ("hire_date", DateType(), True),
        ("location", StringType(), True),
        ("establish_date", DateType(), True),
        ("class", StringType(), True),
        ("group", StringType(), False),
        ("logs", StringType(), True),
        ("source", StringType(), True),
    ]
])

In [106]:
# Load the merged raw CSV using the explicit schema above; header=True makes the reader
# treat the first line of the file as column names rather than data.
raw_df = spark.read.csv("/user/q-company/raw_layer_test/all_groups_merged.csv", schema=raw_dataSchema, header=True)

In [107]:
# Tag every row with a generated id. monotonically_increasing_id() yields unique,
# increasing 64-bit values, but they are not guaranteed to be consecutive.
raw_df = raw_df.withColumn("row_index", monotonically_increasing_id())

In [108]:
# Preview the first two rows, one field per line, without truncating long values.
raw_df.show(n=2, truncate=False, vertical=True)

-RECORD 0---------------------------------------
 transaction_date | 2023-05-20                  
 transaction_id   | trx-152546429674            
 customer_id      | 85469                       
 customer_fname   | Alexander                   
 customer_lname   | Brown                       
 cusomter_email   | alexander.brown@gmail.com"" 
 sales_agent_id   | 1.0                         
 branch_id        | 2.0                         
 product_id       | 22                          
 product_name     | Coffee Maker                
 product_category | Appliances                  
 offer_1          | null                        
 offer_2          | null                        
 offer_3          | null                        
 offer_4          | null                        
 offer_5          | null                        
 units            | 10                          
 unit_price       | 79.99                       
 is_online        | no                          
 payment_method   | 

In [109]:
def remove_blank_columns(df: DataFrame) -> DataFrame:
    """Return ``df`` without the columns whose values are all null.

    The non-null count of every column is computed in a single aggregation, so the
    data is scanned once instead of once per column.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame that keeps, in their original order, only the columns holding at
        least one non-null value. An empty ``df`` therefore loses every column.
    """
    # Local import: `count` is not part of the notebook's top-level import cell.
    from pyspark.sql.functions import count

    if not df.columns:
        # Nothing to aggregate; agg() requires at least one expression.
        return df.select([])

    # count(column) ignores nulls, so a result of 0 means the column is entirely null.
    non_null_counts = df.agg(*[count(col(c)) for c in df.columns]).first()
    return df.select([c for c, n in zip(df.columns, non_null_counts) if n > 0])


In [110]:
# Drop the columns that contain no non-null value at all.
raw_without_blanks_cols = remove_blank_columns(raw_df)

In [111]:
# Preview two rows vertically with untruncated values.
raw_without_blanks_cols.show(n=2, truncate=False, vertical=True)

-RECORD 0---------------------------------------
 transaction_date | 2023-05-20                  
 transaction_id   | trx-152546429674            
 customer_id      | 85469                       
 customer_fname   | Alexander                   
 customer_lname   | Brown                       
 cusomter_email   | alexander.brown@gmail.com"" 
 sales_agent_id   | 1.0                         
 branch_id        | 2.0                         
 product_id       | 22                          
 product_name     | Coffee Maker                
 product_category | Appliances                  
 offer_1          | null                        
 offer_2          | null                        
 offer_3          | null                        
 offer_4          | null                        
 offer_5          | null                        
 units            | 10                          
 unit_price       | 79.99                       
 is_online        | no                          
 payment_method   | 

In [112]:
def split_shipping_address(df: DataFrame) -> DataFrame:
    """Break ``shipping_address`` into street, city, state and zip-code columns.

    The address is split on "/" and the first four pieces are stored, in order, as
    ``shipping_street_name``, ``shipping_city``, ``shipping_state`` and
    ``shipping_zip_code``. The original ``shipping_address`` column and the temporary
    ``shipping_address_split`` array column are removed from the result.

    Args:
        df: DataFrame containing a ``shipping_address`` column.

    Returns:
        The DataFrame with the four component columns appended.
    """
    component_names = (
        "shipping_street_name",
        "shipping_city",
        "shipping_state",
        "shipping_zip_code",
    )

    with_parts = df.withColumn("shipping_address_split", split(col("shipping_address"), "/"))
    for position, component in enumerate(component_names):
        with_parts = with_parts.withColumn(component, col("shipping_address_split")[position])

    return with_parts.drop("shipping_address", "shipping_address_split")


In [113]:
# Replace shipping_address with its street / city / state / zip-code components.
raw_splited_address = split_shipping_address(raw_without_blanks_cols)

In [114]:
# Preview two online rows, the ones expected to carry a shipping address.
raw_splited_address.filter(raw_splited_address["is_online"].isin('yes')).show(n=2, truncate=False, vertical=True)

-RECORD 0---------------------------------------
 transaction_date     | 2022-11-24              
 transaction_id       | trx-630807021567        
 customer_id          | 85541                   
 customer_fname       | Ava                     
 customer_lname       | Jones                   
 cusomter_email       | ava.jones@hotmail.com:  
 sales_agent_id       | null                    
 branch_id            | null                    
 product_id           | 27                      
 product_name         | Iron                    
 product_category     | Appliances              
 offer_1              | null                    
 offer_2              | null                    
 offer_3              | null                    
 offer_4              | null                    
 offer_5              | null                    
 units                | 6                       
 unit_price           | 29.99                   
 is_online            | yes                     
 payment_method     

In [115]:
# Discount rate attached to each offer column; the "null" entry is the rate used when
# no offer applies.
# NOTE(review): offer_4 and offer_5 carry the same rate — confirm this is intended.
offers_dict = dict([
    ("null", 0.0),
    ("offer_1", 0.05),
    ("offer_2", 0.1),
    ("offer_3", 0.15),
    ("offer_4", 0.20),
    ("offer_5", 0.20),
])


def offers_mapping(offer: str) -> float:
    """Look up the discount rate for ``offer`` in ``offers_dict``.

    Raises:
        KeyError: if ``offer`` is not a key of ``offers_dict``.
    """
    rate = offers_dict[offer]
    return rate

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when, lit, coalesce
from pyspark.sql.types import FloatType
from typing import Dict

def map_offers_to_discount(spark: SparkSession, df: DataFrame, offers_dict: Dict[str, float]) -> DataFrame:
    """Add a float ``discount`` column derived from the ``offer_1`` .. ``offer_5`` columns.

    For each row the discount is the rate of the first offer column (checked in the
    order offer_1 .. offer_5) whose value equals ``True``; rows with no such offer get
    ``offers_dict["null"]``.

    Offer columns that are absent from ``df`` are skipped instead of raising an
    analysis error. This matters because an offer column that is entirely null may
    already have been removed upstream.

    Args:
        spark: Unused; kept so existing callers keep working.
        df: DataFrame holding zero or more of the offer columns.
        offers_dict: Maps each offer column name, plus the key ``"null"``, to a rate.

    Returns:
        ``df`` with a ``discount`` column (FloatType) appended.

    Raises:
        KeyError: if ``offers_dict`` lacks ``"null"`` or the rate of a present offer column.
    """
    offer_columns = ["offer_1", "offer_2", "offer_3", "offer_4", "offer_5"]
    present_offers = [name for name in offer_columns if name in df.columns]

    # The rates are embedded as literals while the expression is built on the driver,
    # so no broadcast variable is needed.
    candidate_discounts = [
        when(col(name) == lit(True), lit(offers_dict[name])) for name in present_offers
    ]
    fallback = lit(offers_dict["null"])

    # coalesce picks the first non-null candidate and falls back to the default rate.
    discount = coalesce(*candidate_discounts, fallback).cast(FloatType())
    return df.withColumn("discount", discount)

# Derive the discount column from the offer flag columns.
result_df = map_offers_to_discount(spark, raw_splited_address, offers_dict)

In [116]:
# Drop the raw offer flag columns; the derived discount column stays.
result_df = result_df.drop("offer_1", "offer_2", "offer_3", "offer_4", "offer_5")

In [117]:
def rename_columns(df: DataFrame) -> DataFrame:
    """Give the sales-agent and branch attribute columns self-describing names.

    Renames ``name``, ``hire_date``, ``location``, ``establish_date`` and ``class`` to
    ``sales_agent_name``, ``sales_agent_hire_date``, ``branch_location``,
    ``branch_establish_date`` and ``branch_class`` respectively.

    Args:
        df: DataFrame to rename columns on.

    Returns:
        The DataFrame with the renamed columns.
    """
    new_names = {
        "name": "sales_agent_name",
        "hire_date": "sales_agent_hire_date",
        "location": "branch_location",
        "establish_date": "branch_establish_date",
        "class": "branch_class",
    }

    renamed = df
    for current_name, target_name in new_names.items():
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed


In [118]:
# Apply the sales-agent / branch column renames.
result_df = rename_columns(result_df)

In [119]:
# Display the frame after the discount mapping and the column renames.
result_df

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,cusomter_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,is_online,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,row_index,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,discount
2023-05-20,trx-152546429674,85469,Alexander,Brown,alexander.brown@g...,1.0,2.0,22,Coffee Maker,Appliances,10,79.99,no,Cash,John Doe,2020-06-03,Los Angeles,2016-07-28,B,1,0,,,,,0.0
2022-10-25,trx-291375327542,85512,William,Brown,william.brown@gma...,3.0,1.0,24,Blender,Appliances,5,49.99,no,Cash,Michael Johnson,2021-10-03,New York,2017-01-15,A,1,1,,,,,0.2
2022-02-05,trx-312507679871,85484,John,Williams,john.williams@gma...,10.0,3.0,4,Headphones,Electronics,1,99.99,no,Credit Card,Sophia Moore,2019-05-25,Chicago,2015-03-10,A,1,2,,,,,0.0
2023-10-20,trx-193384855491,85528,Alexander,Miller,alexander.miller@...,7.0,2.0,25,Washing Machine,Appliances,8,499.99,no,Cash,Christopher Miller,2020-01-11,Los Angeles,2016-07-28,B,1,3,,,,,0.0
2022-11-17,trx-831626097654,85500,John,Brown,john.brown@hotmai...,5.0,1.0,14,Camera,Electronics,10,399.99,no,Cash,David Wilson,2021-04-08,New York,2017-01-15,A,1,4,,,,,0.15
2022-09-27,trx-158496122054,85545,Sophia,Wilson,sophia.wilson@hot...,4.0,5.0,14,Camera,Electronics,6,399.99,no,Credit Card,Emily Brown,2020-10-25,Phoenix,2017-09-20,C,1,5,,,,,0.2
2022-04-21,trx-722817999024,85561,Alexander,Moore,alexander.moore@y...,4.0,1.0,30,Electric Kettle,Appliances,6,24.99,no,Credit Card,Emily Brown,2020-10-25,New York,2017-01-15,A,1,6,,,,,0.2
2023-04-28,trx-813287633702,85520,Alexander,Wilson,alexander.wilson@...,1.0,1.0,26,Vacuum Cleaner,Appliances,4,199.99,no,Cash,John Doe,2020-06-03,New York,2017-01-15,A,1,7,,,,,0.0
2023-03-08,trx-219568257432,85488,Michael,Miller,michael.miller@ya...,6.0,2.0,18,Boots,Footwear,10,149.99,no,Credit Card,Emma Taylor,2019-03-28,Los Angeles,2016-07-28,B,1,8,,,,,0.0
2023-06-17,trx-352160720823,85466,Michael,Brown,michael.brown@yah...,5.0,2.0,16,Skirt,Clothing,8,39.99,no,Cash,David Wilson,2021-04-08,Los Angeles,2016-07-28,B,1,9,,,,,0.0


In [122]:
def merge_customer_name(df: DataFrame) -> DataFrame:
    """Combine ``customer_fname`` and ``customer_lname`` into one ``customer_name`` column.

    The two parts are joined with a single space and the source columns are dropped.
    Spark's ``concat`` returns null when any input is null, so a row missing either
    part gets a null ``customer_name``.

    Args:
        df: DataFrame containing ``customer_fname`` and ``customer_lname``.

    Returns:
        The DataFrame with ``customer_name`` appended and the two source columns removed.
    """
    full_name = concat(col("customer_fname"), lit(" "), col("customer_lname"))
    with_full_name = df.withColumn("customer_name", full_name)
    return with_full_name.drop("customer_fname", "customer_lname")

# Replace the separate first / last name columns with a single customer_name column.
result_df = merge_customer_name(result_df)

In [127]:
# Lookup from the shipping_state value to the full state name.
# NOTE(review): the last key is the full name "Washington" rather than an
# abbreviation — presumably the data contains it in that form; verify.
state_dict = {
    "AZ": "Arizona",
    "DC": "District of Columbia",
    "KY": "Kentucky",
    "CA": "California",
    "CT": "Connecticut",
    "VT": "Vermont",
    "MD": "Maryland",
    "AL": "Alabama",
    "TN": "Tennessee",
    "GA": "Georgia",
    "MA": "Massachusetts",
    "FL": "Florida",
    "CO": "Colorado",
    "AK": "Alaska",
    "AR": "Arkansas",
    "OK": "Oklahoma",
    "Washington": "Washington",
}


from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, create_map, lit
from itertools import chain

def map_states(spark: SparkSession, df: DataFrame, state_dict: Dict[str, str]) -> DataFrame:
    """Add ``shipping_state_full`` holding the full name of each ``shipping_state``.

    Values found in ``state_dict`` are replaced by their mapped full name; any other
    value (including null) is copied through unchanged.

    The chain deliberately has no leading "is a known key" branch: a CASE expression
    returns its first matching branch, so such a branch would match every known key
    and leave the per-state branches unreachable.

    Args:
        spark: Unused; kept so existing callers keep working.
        df: DataFrame containing a ``shipping_state`` column.
        state_dict: Maps a ``shipping_state`` value to the full state name.

    Returns:
        ``df`` with a ``shipping_state_full`` column appended.
    """
    state = col("shipping_state")

    # Build one WHEN branch per dictionary entry. The values are embedded as literals
    # on the driver, so no broadcast variable is needed.
    mapping_expr = None
    for abbr, full_name in state_dict.items():
        if mapping_expr is None:
            mapping_expr = when(state == abbr, lit(full_name))
        else:
            mapping_expr = mapping_expr.when(state == abbr, lit(full_name))

    if mapping_expr is None:
        # Empty dictionary: nothing to translate, copy the column as-is.
        mapping_expr = state
    else:
        mapping_expr = mapping_expr.otherwise(state)

    return df.withColumn("shipping_state_full", mapping_expr)

# Add shipping_state_full next to the original shipping_state column.
result_df2 = map_states(spark, result_df, state_dict)

# Preview two online rows to inspect the mapped shipping state.
result_df2.filter(result_df2["is_online"].isin('yes')).show(n=2, truncate=False, vertical=True)


-RECORD 0----------------------------------------
 transaction_date      | 2022-11-24              
 transaction_id        | trx-630807021567        
 customer_id           | 85541                   
 cusomter_email        | ava.jones@hotmail.com:  
 sales_agent_id        | null                    
 branch_id             | null                    
 product_id            | 27                      
 product_name          | Iron                    
 product_category      | Appliances              
 units                 | 6                       
 unit_price            | 29.99                   
 is_online             | yes                     
 payment_method        | Credit Card             
 sales_agent_name      | null                    
 sales_agent_hire_date | null                    
 branch_location       | null                    
 branch_establish_date | null                    
 branch_class          | null                    
 group                 | 1                       


In [101]:
# The frame produced by map_states is named `result_df2`; `result_df_2` is not defined
# anywhere in this notebook and would raise NameError on a fresh kernel.
result_df2.filter(result_df2.is_online.isin('yes')).show(2, False, True)

-RECORD 0----------------------------------------
 transaction_date      | 2022-11-24              
 transaction_id        | trx-630807021567        
 customer_id           | 85541                   
 cusomter_email        | ava.jones@hotmail.com:  
 sales_agent_id        | null                    
 branch_id             | null                    
 product_id            | 27                      
 product_name          | Iron                    
 product_category      | Appliances              
 units                 | 6                       
 unit_price            | 29.99                   
 is_online             | yes                     
 payment_method        | Credit Card             
 sales_agent_name      | null                    
 sales_agent_hire_date | null                    
 branch_location       | null                    
 branch_establish_date | null                    
 branch_class          | null                    
 group                 | 1                       


In [131]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, regexp_extract, when
import re

def validate_email(spark: SparkSession, df: DataFrame, email_column: str = 'cusomter_email') -> DataFrame:
    """Add ``valid_<email_column>``: the e-mail when it matches the pattern, else null.

    Also registers a session-level Python UDF named ``is_valid_email``. This function
    does not use that UDF itself; the registration is kept so SQL that refers to it
    continues to work.

    Args:
        spark: Session on which the ``is_valid_email`` UDF is registered.
        df: DataFrame containing ``email_column``.
        email_column: Name of the column holding the e-mail addresses.

    Returns:
        ``df`` with ``valid_<email_column>`` appended. The new column is null for null
        input and for values that do not match the pattern in full.
    """
    # Whole-string pattern: local part, "@", domain, and a TLD of at least two letters.
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

    # The pattern is a short string captured by the closure below and embedded in the
    # column expression on the driver, so no broadcast variable is needed.
    def is_valid_email(email):
        if email is None:
            return False
        return bool(re.match(email_pattern, email))

    spark.udf.register("is_valid_email", is_valid_email)

    # regexp_extract returns '' when nothing matches and null for null input. In both
    # cases the condition is not true, and when() without otherwise() yields null.
    matches_pattern = regexp_extract(col(email_column), email_pattern, 0) != ''
    return df.withColumn(f"valid_{email_column}", when(matches_pattern, col(email_column)))


In [132]:
# Add valid_cusomter_email: the address when it matches the e-mail pattern, else null.
result_df3 = validate_email(spark, result_df2)

In [133]:
# Display the frame with the validation column appended.
result_df3

transaction_date,transaction_id,customer_id,cusomter_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,is_online,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,row_index,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,discount,customer_name,shipping_state_full,valid_cusomter_email
2023-05-20,trx-152546429674,85469,alexander.brown@g...,1.0,2.0,22,Coffee Maker,Appliances,10,79.99,no,Cash,John Doe,2020-06-03,Los Angeles,2016-07-28,B,1,0,,,,,0.0,Alexander Brown,,
2022-10-25,trx-291375327542,85512,william.brown@gma...,3.0,1.0,24,Blender,Appliances,5,49.99,no,Cash,Michael Johnson,2021-10-03,New York,2017-01-15,A,1,1,,,,,0.2,William Brown,,
2022-02-05,trx-312507679871,85484,john.williams@gma...,10.0,3.0,4,Headphones,Electronics,1,99.99,no,Credit Card,Sophia Moore,2019-05-25,Chicago,2015-03-10,A,1,2,,,,,0.0,John Williams,,
2023-10-20,trx-193384855491,85528,alexander.miller@...,7.0,2.0,25,Washing Machine,Appliances,8,499.99,no,Cash,Christopher Miller,2020-01-11,Los Angeles,2016-07-28,B,1,3,,,,,0.0,Alexander Miller,,
2022-11-17,trx-831626097654,85500,john.brown@hotmai...,5.0,1.0,14,Camera,Electronics,10,399.99,no,Cash,David Wilson,2021-04-08,New York,2017-01-15,A,1,4,,,,,0.15,John Brown,,
2022-09-27,trx-158496122054,85545,sophia.wilson@hot...,4.0,5.0,14,Camera,Electronics,6,399.99,no,Credit Card,Emily Brown,2020-10-25,Phoenix,2017-09-20,C,1,5,,,,,0.2,Sophia Wilson,,
2022-04-21,trx-722817999024,85561,alexander.moore@y...,4.0,1.0,30,Electric Kettle,Appliances,6,24.99,no,Credit Card,Emily Brown,2020-10-25,New York,2017-01-15,A,1,6,,,,,0.2,Alexander Moore,,
2023-04-28,trx-813287633702,85520,alexander.wilson@...,1.0,1.0,26,Vacuum Cleaner,Appliances,4,199.99,no,Cash,John Doe,2020-06-03,New York,2017-01-15,A,1,7,,,,,0.0,Alexander Wilson,,
2023-03-08,trx-219568257432,85488,michael.miller@ya...,6.0,2.0,18,Boots,Footwear,10,149.99,no,Credit Card,Emma Taylor,2019-03-28,Los Angeles,2016-07-28,B,1,8,,,,,0.0,Michael Miller,,
2023-06-17,trx-352160720823,85466,michael.brown@yah...,5.0,2.0,16,Skirt,Clothing,8,39.99,no,Cash,David Wilson,2021-04-08,Los Angeles,2016-07-28,B,1,9,,,,,0.0,Michael Brown,,


In [146]:
from pyspark.sql.functions import lower, regexp_replace, when, col, lit
from pyspark.sql.types import StringType
from pyspark.sql import DataFrame, SparkSession
import re

def clean_and_validate_email(spark: SparkSession, df: DataFrame, email_column: str = 'cusomter_email') -> DataFrame:
    """Normalise an e-mail column and classify each value.

    Three columns are appended:

    * ``cleaned_<email_column>``: the value lower-cased with all whitespace removed.
    * ``email_status_<email_column>``: one of ``"Empty email"``, ``"Missing @"``,
      ``"Invalid format"`` or ``"Valid"``.
    * ``valid_<email_column>``: the cleaned value when the status is ``"Valid"``,
      otherwise null.

    The classification runs in a Python UDF, which is also registered on the session
    under the SQL name ``validate_email``.

    NOTE(review): cleaning only lower-cases and strips whitespace. Values carrying
    other trailing characters (e.g. ``;`` or ``""``) are therefore reported as
    "Invalid format". Confirm whether such characters should be stripped first.

    Args:
        spark: Session on which the UDF is registered.
        df: DataFrame containing ``email_column``.
        email_column: Name of the column holding the e-mail addresses.

    Returns:
        ``df`` with the three columns described above appended.
    """
    cleaned_name = f"cleaned_{email_column}"
    status_name = f"email_status_{email_column}"
    valid_name = f"valid_{email_column}"

    # Whole-string pattern: local part, "@", domain, and a TLD of at least two letters.
    # The UDF closure captures it directly, so no broadcast variable is needed.
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

    # Named differently from the notebook-level validate_email() so that it does not
    # shadow it inside this function; the registered SQL name stays "validate_email".
    def email_status(email):
        if email is None or email.strip() == "":
            return "Empty email"
        if '@' not in email:
            return "Missing @"
        if not re.match(email_pattern, email):
            return "Invalid format"
        return "Valid"

    validate_email_udf = spark.udf.register("validate_email", email_status, StringType())

    return (
        df
        # Lower-case and remove all whitespace.
        .withColumn(cleaned_name, regexp_replace(lower(col(email_column)), r"\s+", ""))
        # Classify the cleaned value.
        .withColumn(status_name, validate_email_udf(col(cleaned_name)))
        # Keep the cleaned value only when it was classified as valid.
        .withColumn(valid_name, when(col(status_name) == "Valid", col(cleaned_name)).otherwise(lit(None)))
    )

# Usage: reuse the notebook's existing `spark` session. Calling
# SparkSession.builder...getOrCreate() again would only hand back that same session.
result_df3 = clean_and_validate_email(spark, result_df2)

# Show the original, cleaned, status and validated e-mail columns side by side
result_df3.select(
    'cusomter_email',
    'cleaned_cusomter_email',
    'email_status_cusomter_email',
    'valid_cusomter_email',
).show(truncate=False)

+------------------------------+------------------------------+---------------------------+--------------------+
|cusomter_email                |cleaned_cusomter_email        |email_status_cusomter_email|valid_cusomter_email|
+------------------------------+------------------------------+---------------------------+--------------------+
|alexander.brown@gmail.com""   |alexander.brown@gmail.com""   |Invalid format             |null                |
|william.brown@gmail.com;      |william.brown@gmail.com;      |Invalid format             |null                |
|john.williams@gmail.com*      |john.williams@gmail.com*      |Invalid format             |null                |
|alexander.miller@yahoo.com#4r |alexander.miller@yahoo.com#4r |Invalid format             |null                |
|john.brown@hotmail.com;       |john.brown@hotmail.com;       |Invalid format             |null                |
|sophia.wilson@hotmail.com)x   |sophia.wilson@hotmail.com)x   |Invalid format             |null 

In [149]:
# Correct the misspelled e-mail column name that comes from the raw schema.
# NOTE(review): this continues from `result_df`, so the columns added to
# `result_df2` / `result_df3` (shipping_state_full and the e-mail validation columns)
# are not carried forward — confirm that is intended.
result_df = result_df.withColumnRenamed('cusomter_email', 'customer_email')

In [157]:
# Split the data by sales channel. Each side drops the columns listed for it below.
# Rows whose is_online is neither 'yes' nor 'no' end up in neither frame.
columns_dropped_for_online = [
    "sales_agent_id", "branch_id", "sales_agent_name",
    "sales_agent_hire_date", "branch_location",
    "branch_establish_date", "branch_class",
]
columns_dropped_for_offline = [
    "shipping_street_name", "shipping_city",
    "shipping_state", "shipping_zip_code",
]

online_df = result_df.where("is_online == 'yes'").drop(*columns_dropped_for_online)
offline_df = result_df.where("is_online == 'no'").drop(*columns_dropped_for_offline)

In [164]:
# coalesce() returns a new DataFrame and leaves the original untouched, so the result
# has to be assigned for the single-partition layout to reach the writes below.
offline_df = offline_df.coalesce(1)
online_df = online_df.coalesce(1)

# Keep the cell's display of the online frame.
online_df

transaction_date,transaction_id,customer_id,customer_email,product_id,product_name,product_category,units,unit_price,is_online,payment_method,group,row_index,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,discount,customer_name
2022-11-24,trx-630807021567,85541,ava.jones@hotmail...,27,Iron,Appliances,6,29.99,yes,Credit Card,1,0,43 Henderson Avenue,Savannah,GA,31406,0.0,Ava Jones
2023-01-20,trx-231766088020,85550,emma.taylor@gmail...,8,Sneakers,Footwear,6,79.99,yes,PayPal,1,1,8 Linden Circle,Somerville,MA,2143,0.0,Emma Taylor
2022-10-18,trx-334836612567,85479,james.jones@outlo...,6,Jeans,Clothing,2,49.99,yes,Stripe,1,2,1139 Addison Street,Berkeley,CA,94702,0.15,James Jones
2022-04-06,trx-678424403739,85551,michael.johnson@y...,10,Sandals,Footwear,3,39.99,yes,Credit Card,1,3,521 McGregor Court,Montgomery,AL,36117,0.0,Michael Johnson
2022-08-09,trx-034026221655,85470,ava.wilson@hotmai...,3,Tablet,Electronics,6,299.99,yes,Stripe,1,4,10802 Brickside C...,Riverview,FL,33579,0.0,Ava Wilson
2023-10-27,trx-334049470367,85507,ava.miller@outloo...,25,Washing Machine,Appliances,5,499.99,yes,Credit Card,1,5,131 Westerly Street,Manchester,CT,6042,0.2,Ava Miller
2023-03-14,trx-354281638130,85466,michael.brown@yah...,16,Skirt,Clothing,9,39.99,yes,PayPal,1,6,5403 Illinois Avenue,Nashville,TN,37209,0.0,Michael Brown
2023-11-06,trx-919712049046,85493,james.johnson@gma...,27,Iron,Appliances,6,29.99,yes,Credit Card,1,7,10304 North 179th...,Waddell,AZ,85355,0.2,James Johnson
2023-02-17,trx-947364728949,85550,emma.taylor@gmail...,20,Heels,Footwear,9,59.99,yes,Stripe,1,8,10340 West 62nd P...,Arvada,CO,80004,0.0,Emma Taylor
2023-01-12,trx-624869757221,85499,sophia.miller@hot...,3,Tablet,Electronics,1,299.99,yes,PayPal,1,9,629 Cutter Court,Annapolis,MD,21401,0.0,Sophia Miller


In [166]:
# Spark writes a directory at this path containing the part file(s).
# mode="overwrite" lets the cell be re-run; the default mode fails when the path exists.
# header=True keeps the column names, which would otherwise be lost in the CSV output.
offline_df.write.csv("file:///data/dd/offline.csv", mode="overwrite", header=True)

In [167]:
# Spark writes a directory at this path containing the part file(s).
# mode="overwrite" lets the cell be re-run; the default mode fails when the path exists.
# header=True keeps the column names, which would otherwise be lost in the CSV output.
online_df.write.csv("file:///data/dd/online.csv", mode="overwrite", header=True)

In [168]:
# Shut down the Spark session; cells that use `spark` cannot be re-run after this
# without creating the session again.
spark.stop()