In [195]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from datetime import datetime

# Configuration
class Config:
    RAW_BASE_PATH = "/user/itversity/q-company_raw_layer"
    STANDARDIZED_BASE_PATH = "/user/itversity/q-company_standardized_layer"
    
class Schemas:
    offline_transactions = StructType([
        StructField("transaction_date", DateType(), nullable=False),
        StructField("transaction_id", StringType(), nullable=False),
        StructField("customer_id", LongType(), nullable=False),
        StructField("customer_name", StringType(), nullable=False),
        StructField("customer_email", StringType(), nullable=False),
        StructField("sales_agent_id", LongType(), nullable=False),
        StructField("branch_id", LongType(), nullable=False),
        StructField("product_id", LongType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("product_category", StringType(), nullable=False),
        StructField("units", IntegerType(), nullable=False),
        StructField("unit_price", DoubleType(), nullable=False),
        StructField("discount", FloatType(), nullable=False),
        StructField("total_price", DoubleType(), nullable=False),
        StructField("payment_method", StringType(), nullable=False),
        StructField("sales_agent_name", StringType(), nullable=False),
        StructField("sales_agent_hire_date", DateType(), nullable=False),
        StructField("branch_location", StringType(), nullable=False),
        StructField("branch_establish_date", DateType(), nullable=False),
        StructField("branch_class", StringType(), nullable=False),
        StructField("group", StringType(), nullable=False)
    ])

    online_transactions = StructType([
        StructField("transaction_date", DateType(), nullable=False),
        StructField("transaction_id", StringType(), nullable=False),
        StructField("customer_id", LongType(), nullable=False),
        StructField("customer_name", StringType(), nullable=False),
        StructField("customer_email", StringType(), nullable=False),
        StructField("product_id", LongType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("product_category", StringType(), nullable=False),
        StructField("units", IntegerType(), nullable=False),
        StructField("unit_price", DoubleType(), nullable=False),
        StructField("discount", FloatType(), nullable=False),
        StructField("total_price", DoubleType(), nullable=False),
        StructField("payment_method", StringType(), nullable=False),
        StructField("shipping_street_name",  StringType(), nullable=False), 
        StructField("shipping_city",  StringType(), nullable=False),
        StructField("shipping_state",  StringType(), nullable=False),
        StructField("shipping_zip_code",  StringType(), nullable=False),
        StructField("group", StringType(), nullable=False)
    ])
    
# Utility functions
class HDFSUtils:
    @staticmethod
    def get_latest_file(spark: SparkSession, hdfs_path: str) -> str:
        files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
            .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
        sorted_files = sorted(files, key=lambda f: f.getModificationTime(), reverse=True)
        return sorted_files[0].getPath().toString() if sorted_files else None
    

In [218]:
from pyspark.sql import SparkSession, DataFrame
from typing import Dict, List, Callable
from datetime import datetime
from utils import HDFSUtils
import os

# Data reading
class DataReader:
    @staticmethod
    def read_latest_csv(spark: SparkSession, base_path: str, num_partitions: int = 200) -> DataFrame:
        current_date = datetime.now().strftime("%Y-%m-%d")
        hdfs_path = f"{base_path}/raw_sales_transactions_{current_date}"
        latest_file = HDFSUtils.get_latest_file(spark, hdfs_path)
        max_retries = 5
        initial_wait_time = 5
        
        if latest_file:
            print(f"Processing file: {latest_file}")
            for attempt in range(max_retries):
                try:
                    # Read the CSV file with header and repartition
                    df = spark.read.option("header", "true").csv(latest_file)
                    repartitioned_df = df.repartition(num_partitions)
                    print(f"Successfully read CSV and partitioned to {num_partitions}")
                    return repartitioned_df
                except Exception as e:
                    wait_time = initial_wait_time * (2 ** attempt)
                    print(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    print("Full stack trace:")
                    traceback.print_exc()
                    if attempt + 1 < max_retries:
                        print(f"Retrying in {wait_time} seconds...")
                        time.sleep(wait_time)
                    else:
                        print("Max retries reached. Could not read the CSV file.")
                        return None
        else:
            print(f"No files found in {hdfs_path}")
            return None
        
# Data Writer
class DataWriter:
    @staticmethod
    def write_parquet(spark: SparkSession, df: DataFrame, base_path: str, transaction_type: str, partition_cols: List[str]) -> None:
        current_date = datetime.now().strftime("%Y-%m-%d")
        standardized_dir = f"standardized_sales_transaction_{current_date}"
        full_path = os.path.join(base_path, standardized_dir)

        # Check if the standardized directory for the current day exists, create it if not
        fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
        if not fs.exists(spark._jvm.org.apache.hadoop.fs.Path(full_path)):
            fs.mkdirs(spark._jvm.org.apache.hadoop.fs.Path(full_path))
            print(f"Created directory: {full_path}")
        else:
            print(f"Directory already exists: {full_path}")

        # Generate timestamp
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

        result_string = ''
        try:
            first_group = split_dfs['online'].select("group").filter("group is not null").first()[0]
            result_string = str(first_group)
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            result_string = None            
        
        file_name = f"{transaction_type}_transactions_{result_string}_{timestamp}"
        group_path = os.path.join(full_path, file_name)

        df.write \
            .partitionBy(partition_cols) \
            .mode("overwrite") \
            .parquet(group_path)

        print(f"Written {transaction_type} transactions for group {result_string} to {group_path}")

In [205]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import *
from typing import Dict, List, Callable
from datetime import datetime
import re

# Data transformation functions
class DataTransformer:
    @staticmethod
    def rename_columns(df: DataFrame) -> DataFrame:
        return df.withColumnRenamed("name", "sales_agent_name") \
                 .withColumnRenamed("hire_date", "sales_agent_hire_date") \
                 .withColumnRenamed("location", "branch_location") \
                 .withColumnRenamed("establish_date", "branch_establish_date") \
                 .withColumnRenamed("class", "branch_class") \
                 .withColumnRenamed("cusomter_email", "customer_email") \
                 .withColumnRenamed("cusomter_lname", "customer_lname")

    @staticmethod
    def remove_blank_columns(df: DataFrame) -> DataFrame:
        return df.select([c for c in df.columns if df.filter(col(c).isNotNull()).count() > 0])

    @staticmethod
    def map_offers_to_discount(spark: SparkSession, df: DataFrame, offers_dict: Dict[str, float]) -> DataFrame:
        broadcast_offers = spark.sparkContext.broadcast(offers_dict)
        offer_columns = ["offer_1", "offer_2", "offer_3", "offer_4", "offer_5"]
        
        def create_offer_column(offer_col: str):
            return when(col(offer_col) == lit(True), lit(broadcast_offers.value[offer_col]))
        
        offer_discount_columns = [create_offer_column(offer_col).alias(f"{offer_col}_discount") for offer_col in offer_columns]
        df_with_offer_discounts = df.select("*", *offer_discount_columns)
        discount_column = coalesce(*[col(f"{offer_col}_discount") for offer_col in offer_columns], lit(broadcast_offers.value["null"]))
        
        return df_with_offer_discounts.withColumn("discount", discount_column.cast(FloatType())) \
                                      .drop(*[f"{offer_col}_discount" for offer_col in offer_columns]) \
                                      .drop(*offer_columns)

    @staticmethod
    def merge_customer_name(df: DataFrame) -> DataFrame:
        return df.withColumn("customer_name", concat(col("customer_fname"), lit(" "), col("customer_lname"))) \
                 .drop("customer_fname", "customer_lname")

    @staticmethod
    @udf(returnType=StringType())
    def clean_email(email: str) -> str:
        if email is None:
            return None
        email = email.strip()
        com = email.rfind('.')
        email = email[:com+1] + "com"
        email = re.sub(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}).*', r'\1', email)
        return email if re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email) else None

    @staticmethod
    @udf(returnType=StringType())
    def validate_transaction_id(trx_id: str) -> str:
        if trx_id is None:
            return None
        trx_id = trx_id.strip()
        numeric_part = re.sub(r'\D', '', trx_id)
        return f"trx-{numeric_part}" if numeric_part else None

    @staticmethod
    @udf(returnType=DoubleType())
    def validate_unit_price(price):
        return price if price >= 0 else (-1 * price)
    
    @staticmethod    
    def rearrange_columns(df: DataFrame) -> DataFrame:
        new_order = [
            'transaction_id', 'transaction_date', 'customer_id', 'customer_name', 'customer_email',
            'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
            'payment_method', 'group', 'is_online', 'sales_agent_id', 'sales_agent_name',
            'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
            'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
        ]
        return df.select(new_order)

    @staticmethod
    def convert_dates_to_date_type(df: DataFrame) -> DataFrame:
        return df.withColumn('transaction_date', to_date(col('transaction_date'))) \
                 .withColumn('branch_establish_date', to_date(col('branch_establish_date'))) \
                 .withColumn('sales_agent_hire_date', to_date(col('sales_agent_hire_date')))

    @staticmethod
    def convert_ids_to_long_type(df: DataFrame) -> DataFrame:
        return df.withColumn('sales_agent_id', col('sales_agent_id').cast(LongType())) \
                 .withColumn('branch_id', col('branch_id').cast(LongType()))

    @staticmethod
    def split_shipping_address(df: DataFrame) -> DataFrame:
        return df.withColumn("shipping_address_split", split(col("shipping_address"), "/")) \
                 .withColumn("shipping_street_name", col("shipping_address_split")[0]) \
                 .withColumn("shipping_city", col("shipping_address_split")[1]) \
                 .withColumn("shipping_state", col("shipping_address_split")[2]) \
                 .withColumn("shipping_zip_code", col("shipping_address_split")[3]) \
                 .drop("shipping_address", "shipping_address_split")
    
    @staticmethod
    def calculate_total_price(df: DataFrame) -> DataFrame:
        return df.withColumn("total_price", round(col("units") * col("unit_price") * (1 - col("discount")), 3))

    @staticmethod
    def map_shipping_state(spark: SparkSession, df: DataFrame, state_dict: Dict[str, str]) -> DataFrame:
        broadcast_dict = spark.sparkContext.broadcast(state_dict)
        conditions = coalesce(*[when(col("shipping_state") == key, lit(value)) for key, value in broadcast_dict.value.items()])
        return df.withColumn("shipping_state_mapped", when(conditions.isNotNull(), conditions).otherwise(col("shipping_state"))) \
                 .drop("shipping_state") \
                 .withColumnRenamed("shipping_state_mapped", "shipping_state")

In [198]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import *
from typing import Dict, List, Callable
from datetime import datetime
from transformation import DataTransformer
import re

# Data Quality Layer
class DataQualityLayer:
    def __init__(self, spark: SparkSession):
        self.spark = spark
        self.state_dict = {
            'AZ': 'Arizona', 'DC': 'District of Columbia', 'KY': 'Kentucky',
            'CA': 'California', 'CT': 'Connecticut', 'VT': 'Vermont',
            'MD': 'Maryland', 'AL': 'Alabama', 'TN': 'Tennessee',
            'GA': 'Georgia', 'MA': 'Massachusetts', 'FL': 'Florida',
            'CO': 'Colorado', 'AK': 'Alaska', 'AR': 'Arkansas',
            'OK': 'Oklahoma', 'Washington': 'Washington'
        }
        self.offers_dict = {
            "null": 0.0, "offer_1": 0.05, "offer_2": 0.1,
            "offer_3": 0.15, "offer_4": 0.20, "offer_5": 0.25
        }

    def split_online_offline(self, df: DataFrame) -> Dict[str, DataFrame]:
        return {
            "online": df.filter(col("is_online") == "yes").drop("is_online"),
            "offline": df.filter(col("is_online") == "no").drop("is_online")
        }

    def apply_common_transformations(self, df: DataFrame) -> DataFrame:
        transformations = [
            DataTransformer.rename_columns,
            DataTransformer.remove_blank_columns,
            lambda df: DataTransformer.map_offers_to_discount(self.spark, df, self.offers_dict),
            DataTransformer.merge_customer_name,
            lambda df: df.withColumn("customer_email", DataTransformer.clean_email(col("customer_email"))),
            lambda df: df.withColumn("transaction_id", DataTransformer.validate_transaction_id(col("transaction_id"))),
            DataTransformer.calculate_total_price
        ]

        for transform in transformations:
            df = transform(df)
        return df

    def apply_offline_transformations(self, df: DataFrame) -> DataFrame:
        transformations = [
            DataTransformer.convert_dates_to_date_type,
            DataTransformer.convert_ids_to_long_type,
        ]

        for transform in transformations:
            df = transform(df)
        return df

    def apply_online_transformations(self, df: DataFrame) -> DataFrame:
        transformations = [
            lambda df: df.withColumn('transaction_date', to_date(col('transaction_date'))),
            DataTransformer.split_shipping_address,
            lambda df: DataTransformer.map_shipping_state(self.spark, df, self.state_dict),
        ]

        for transform in transformations:
            df = transform(df)
        return df

    def add_row_index(self, df: DataFrame) -> DataFrame:
        return df.withColumn("row_index", monotonically_increasing_id())

In [199]:
spark = SparkSession.builder.appName("DataQualityLayer").getOrCreate()

In [200]:
raw_df = DataReader.read_latest_csv(spark, Config.RAW_BASE_PATH, 50)

Processing file: hdfs://localhost:9000/user/itversity/q-company_raw_layer/raw_sales_transactions_2024-07-10/group2_merged_sales_transactions_20240711_000534.csv
Successfully read CSV and partitioned to 50


In [92]:
raw_df.count()

1500

In [153]:
df = DataTransformer.rename_columns(raw_df)

In [154]:
df = DataTransformer.remove_blank_columns(df)

In [202]:
df = DataTransformer.map_offers_to_discount(spark, raw_df, dq_layer.offers_dict)

In [156]:
df = DataTransformer.merge_customer_name(df)

In [203]:
df

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,is_online,payment_method,shipping_address,name,hire_date,location,establish_date,class,group,discount
2022-2-26,trx-100226527334,85523,Michael,Smith,michael.smith@yah...,,,9,Boots,Footwear,4,129.99,yes,PayPal,6717 North 59th A...,,,,,,group2,0.25
2023-4-7,trx-366002476938,85487,Olivia,Smith,olivia.smith@hotm...,5.0,3.0,16,Skirt,Clothing,2,39.99,no,Cash,,David Wilson,2021-4-8,Chicago,2015-03-10,A,group2,0.25
2023-11-24,trx-343713769120,85537,Michael,Brown,michael.brown@yah...,6.0,3.0,18,Boots,Footwear,5,149.99,no,Credit Card,,Emma Taylor,2019-3-28,Chicago,2015-03-10,A,group2,0.0
2023-12-11,trx-922965728140,85508,Ava,Brown,ava.brown@gmail.c...,,,22,Coffee Maker,Appliances,3,79.99,yes,Stripe,22219 Panama City...,,,,,,group2,0.0
2022-8-23,trx-438006840866,85475,William,Taylor,william.taylor@gm...,,,26,Vacuum Cleaner,Appliances,5,199.99,yes,Stripe,1801 Dodge Trail/...,,,,,,group2,0.15
2022-3-20,trx-825468820862,85521,Mia,Wilson,mia.wilson@yahoo....,2.0,2.0,11,TV,Electronics,1,899.99,no,Credit Card,,Jane Smith,2018-5-13,Los Angeles,2016-07-28,B,group2,0.05
2022-5-3,trx-484141713642,85482,John,Taylor,john.taylor@hotma...,,,27,Iron,Appliances,8,29.99,yes,Stripe,64 Roseberry Circ...,,,,,,group2,0.25
2022-3-13,trx-087113777656,85510,Alexander,Brown,alexander.brown@y...,9.0,5.0,5,T-Shirt,Clothing,10,19.99,no,Credit Card,,Daniel Martinez,2018-10-8,Phoenix,2017-09-20,C,group2,0.0
2023-3-25,trx-339433590271,85549,William,Johnson,william.johnson@y...,,,18,Boots,Footwear,7,149.99,yes,Stripe,215 9th Street No...,,,,,,group2,0.2
2022-8-8,trx-896156002421,85464,Michael,Miller,michael.miller@ou...,7.0,2.0,27,Iron,Appliances,5,29.99,no,Credit Card,,Christopher Miller,2020-1-11,Los Angeles,2016-07-28,B,group2,0.2


In [157]:
df = df.withColumn("customer_email", DataTransformer.clean_email(col("customer_email")))

In [158]:
df = df.withColumn("transaction_id", DataTransformer.validate_transaction_id(col("transaction_id")))

In [206]:
DataTransformer.calculate_total_price(df)

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,is_online,payment_method,shipping_address,name,hire_date,location,establish_date,class,group,discount,total_price
2022-2-26,trx-100226527334,85523,Michael,Smith,michael.smith@yah...,,,9,Boots,Footwear,4,129.99,yes,PayPal,6717 North 59th A...,,,,,,group2,0.25,389.97
2023-4-7,trx-366002476938,85487,Olivia,Smith,olivia.smith@hotm...,5.0,3.0,16,Skirt,Clothing,2,39.99,no,Cash,,David Wilson,2021-4-8,Chicago,2015-03-10,A,group2,0.25,59.985
2023-11-24,trx-343713769120,85537,Michael,Brown,michael.brown@yah...,6.0,3.0,18,Boots,Footwear,5,149.99,no,Credit Card,,Emma Taylor,2019-3-28,Chicago,2015-03-10,A,group2,0.0,749.95
2023-12-11,trx-922965728140,85508,Ava,Brown,ava.brown@gmail.c...,,,22,Coffee Maker,Appliances,3,79.99,yes,Stripe,22219 Panama City...,,,,,,group2,0.0,239.97
2022-8-23,trx-438006840866,85475,William,Taylor,william.taylor@gm...,,,26,Vacuum Cleaner,Appliances,5,199.99,yes,Stripe,1801 Dodge Trail/...,,,,,,group2,0.15,849.958
2022-3-20,trx-825468820862,85521,Mia,Wilson,mia.wilson@yahoo....,2.0,2.0,11,TV,Electronics,1,899.99,no,Credit Card,,Jane Smith,2018-5-13,Los Angeles,2016-07-28,B,group2,0.05,854.99
2022-5-3,trx-484141713642,85482,John,Taylor,john.taylor@hotma...,,,27,Iron,Appliances,8,29.99,yes,Stripe,64 Roseberry Circ...,,,,,,group2,0.25,179.94
2022-3-13,trx-087113777656,85510,Alexander,Brown,alexander.brown@y...,9.0,5.0,5,T-Shirt,Clothing,10,19.99,no,Credit Card,,Daniel Martinez,2018-10-8,Phoenix,2017-09-20,C,group2,0.0,199.9
2023-3-25,trx-339433590271,85549,William,Johnson,william.johnson@y...,,,18,Boots,Footwear,7,149.99,yes,Stripe,215 9th Street No...,,,,,,group2,0.2,839.944
2022-8-8,trx-896156002421,85464,Michael,Miller,michael.miller@ou...,7.0,2.0,27,Iron,Appliances,5,29.99,no,Credit Card,,Christopher Miller,2020-1-11,Los Angeles,2016-07-28,B,group2,0.2,119.96


In [165]:
df

transaction_date,transaction_id,customer_id,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,is_online,payment_method,shipping_address,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,discount,customer_name,total_price
2022-2-26,trx-100226527334,85523,michael.smith@yah...,,,9,Boots,Footwear,4,129.99,yes,PayPal,6717 North 59th A...,,,,,,group2,0.25,Michael Smith,389.97
2023-4-7,trx-366002476938,85487,olivia.smith@hotm...,5.0,3.0,16,Skirt,Clothing,2,39.99,no,Cash,,David Wilson,2021-4-8,Chicago,2015-03-10,A,group2,0.25,Olivia Smith,59.985
2023-11-24,trx-343713769120,85537,michael.brown@yah...,6.0,3.0,18,Boots,Footwear,5,149.99,no,Credit Card,,Emma Taylor,2019-3-28,Chicago,2015-03-10,A,group2,0.0,Michael Brown,749.95
2023-12-11,trx-922965728140,85508,ava.brown@gmail.com,,,22,Coffee Maker,Appliances,3,79.99,yes,Stripe,22219 Panama City...,,,,,,group2,0.0,Ava Brown,239.97
2022-8-23,trx-438006840866,85475,william.taylor@gm...,,,26,Vacuum Cleaner,Appliances,5,199.99,yes,Stripe,1801 Dodge Trail/...,,,,,,group2,0.15,William Taylor,849.958
2022-3-20,trx-825468820862,85521,mia.wilson@yahoo.com,2.0,2.0,11,TV,Electronics,1,899.99,no,Credit Card,,Jane Smith,2018-5-13,Los Angeles,2016-07-28,B,group2,0.05,Mia Wilson,854.99
2022-5-3,trx-484141713642,85482,john.taylor@hotma...,,,27,Iron,Appliances,8,29.99,yes,Stripe,64 Roseberry Circ...,,,,,,group2,0.25,John Taylor,179.94
2022-3-13,trx-087113777656,85510,alexander.brown@y...,9.0,5.0,5,T-Shirt,Clothing,10,19.99,no,Credit Card,,Daniel Martinez,2018-10-8,Phoenix,2017-09-20,C,group2,0.0,Alexander Brown,199.9
2023-3-25,trx-339433590271,85549,william.johnson@y...,,,18,Boots,Footwear,7,149.99,yes,Stripe,215 9th Street No...,,,,,,group2,0.2,William Johnson,839.944
2022-8-8,trx-896156002421,85464,michael.miller@ou...,7.0,2.0,27,Iron,Appliances,5,29.99,no,Credit Card,,Christopher Miller,2020-1-11,Los Angeles,2016-07-28,B,group2,0.2,Michael Miller,119.96


In [166]:
df_splits = dq_layer.split_online_offline(df)

In [167]:
DataTransformer.convert_dates_to_date_type(df_splits['online'])

transaction_date,transaction_id,customer_id,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,payment_method,shipping_address,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,discount,customer_name,total_price
2022-12-10,trx-514265914599,85536,sophia.jones@hotm...,,,23,Toaster,Appliances,2,39.99,PayPal,435 Benita Drive/...,,,,,,group2,0.05,Sophia Jones,75.981
2022-03-22,trx-462457335449,85498,olivia.brown@outl...,,,22,Coffee Maker,Appliances,4,79.99,Credit Card,7785 Montgomery M...,,,,,,group2,0.2,Olivia Brown,255.968
2023-07-23,trx-834189400703,85532,james.smith@yahoo...,,,13,Printer,Electronics,6,149.99,Credit Card,8376 Albacore Dri...,,,,,,group2,0.2,James Smith,719.952
2023-02-27,trx-520346742720,85476,emma.taylor@outlo...,,,17,Blouse,Clothing,10,29.99,Stripe,7419 West Hill La...,,,,,,group2,0.15,Emma Taylor,254.915
2022-08-10,trx-778091208972,85539,emma.davis@hotmai...,,,19,Sandals,Footwear,4,29.99,PayPal,90 Peabody Street...,,,,,,group2,0.1,Emma Davis,107.964
2022-08-13,trx-004528494647,85489,william.moore@yah...,,,6,Jeans,Clothing,3,49.99,Stripe,10841 Sutter Circ...,,,,,,group2,0.25,William Moore,112.477
2022-09-17,trx-142661381200,85484,olivia.brown@hotm...,,,16,Skirt,Clothing,2,39.99,Credit Card,683 North Wilson ...,,,,,,group2,0.2,Olivia Brown,63.984
2023-03-01,trx-706232987460,85545,john.wilson@gmail...,,,6,Jeans,Clothing,4,49.99,Stripe,6057 Griffith Ave...,,,,,,group2,0.0,John Wilson,199.96
2023-07-23,trx-424269763934,85513,james.davis@outlo...,,,17,Blouse,Clothing,3,29.99,Credit Card,32 Royal Drive/Lo...,,,,,,group2,0.25,James Davis,67.477
2023-07-01,trx-505504358640,85485,olivia.smith@hotm...,,,13,Printer,Electronics,9,149.99,Stripe,4216 Lorraine Str...,,,,,,group2,0.0,Olivia Smith,1349.91


In [168]:
DataTransformer.convert_ids_to_long_type(df_splits['online'])

transaction_date,transaction_id,customer_id,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,payment_method,shipping_address,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,discount,customer_name,total_price
2022-12-10,trx-514265914599,85536,sophia.jones@hotm...,,,23,Toaster,Appliances,2,39.99,PayPal,435 Benita Drive/...,,,,,,group2,0.05,Sophia Jones,75.981
2022-3-22,trx-462457335449,85498,olivia.brown@outl...,,,22,Coffee Maker,Appliances,4,79.99,Credit Card,7785 Montgomery M...,,,,,,group2,0.2,Olivia Brown,255.968
2023-7-23,trx-834189400703,85532,james.smith@yahoo...,,,13,Printer,Electronics,6,149.99,Credit Card,8376 Albacore Dri...,,,,,,group2,0.2,James Smith,719.952
2023-2-27,trx-520346742720,85476,emma.taylor@outlo...,,,17,Blouse,Clothing,10,29.99,Stripe,7419 West Hill La...,,,,,,group2,0.15,Emma Taylor,254.915
2022-8-10,trx-778091208972,85539,emma.davis@hotmai...,,,19,Sandals,Footwear,4,29.99,PayPal,90 Peabody Street...,,,,,,group2,0.1,Emma Davis,107.964
2022-8-13,trx-004528494647,85489,william.moore@yah...,,,6,Jeans,Clothing,3,49.99,Stripe,10841 Sutter Circ...,,,,,,group2,0.25,William Moore,112.477
2022-9-17,trx-142661381200,85484,olivia.brown@hotm...,,,16,Skirt,Clothing,2,39.99,Credit Card,683 North Wilson ...,,,,,,group2,0.2,Olivia Brown,63.984
2023-3-1,trx-706232987460,85545,john.wilson@gmail...,,,6,Jeans,Clothing,4,49.99,Stripe,6057 Griffith Ave...,,,,,,group2,0.0,John Wilson,199.96
2023-7-23,trx-424269763934,85513,james.davis@outlo...,,,17,Blouse,Clothing,3,29.99,Credit Card,32 Royal Drive/Lo...,,,,,,group2,0.25,James Davis,67.477
2023-7-1,trx-505504358640,85485,olivia.smith@hotm...,,,13,Printer,Electronics,9,149.99,Stripe,4216 Lorraine Str...,,,,,,group2,0.0,Olivia Smith,1349.91


In [171]:
df_splits['online'] = dq_layer.apply_online_transformations(df_splits['online'])

In [172]:
df_splits['online'].printSchema()

root
 |-- transaction_date: date (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- sales_agent_id: string (nullable = true)
 |-- branch_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: string (nullable = true)
 |-- unit_price: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- sales_agent_name: string (nullable = true)
 |-- sales_agent_hire_date: string (nullable = true)
 |-- branch_location: string (nullable = true)
 |-- branch_establish_date: string (nullable = true)
 |-- branch_class: string (nullable = true)
 |-- group: string (nullable = true)
 |-- discount: float (nullable = false)
 |-- customer_name: string (nullable = true)
 |-- total_price: double (nullable = true)
 |-- shipping_street_name: string (nullable = true)
 |-- s

In [202]:
df_splits['online'].show(2, False, True)

-RECORD 0-----------------------------------------
 transaction_date      | 2023-01-01               
 transaction_id        | trx-332766751554         
 customer_id           | 85469                    
 customer_email        | john.taylor@yahoo..com   
 sales_agent_id        | null                     
 branch_id             | null                     
 product_id            | 17                       
 product_name          | Blouse                   
 product_category      | Clothing                 
 units                 | 8                        
 unit_price            | 29.99                    
 payment_method        | Credit Card              
 sales_agent_name      | null                     
 sales_agent_hire_date | null                     
 branch_location       | null                     
 branch_establish_date | null                     
 branch_class          | null                     
 group                 | group4                   
 discount              | 0.1   

In [173]:
df_splits['online'].select(Schemas.online_transactions.fieldNames())

transaction_date,transaction_id,customer_id,customer_name,customer_email,product_id,product_name,product_category,units,unit_price,discount,total_price,payment_method,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,group
2023-11-16,trx-747520842146,85491,Olivia Taylor,olivia.taylor@yah...,23,Toaster,Appliances,8,39.99,0.0,319.92,PayPal,8852 Broderick St...,Montgomery,Alabama,36117,group2
2023-03-20,trx-398386249338,85533,John Williams,john.williams@gma...,18,Boots,Footwear,1,149.99,0.0,149.99,Stripe,2502 Adrienne Way,Louisville,Kentucky,40216,group2
2023-12-05,trx-760463132741,85554,William Wilson,william.wilson@ya...,28,Hair Dryer,Appliances,2,19.99,0.0,39.98,Stripe,63 Dorchester Street,Worcester,Massachusetts,1604,group2
2023-11-16,trx-343079605633,85524,Alexander Miller,alexander.miller@...,25,Washing Machine,Appliances,2,499.99,0.05,949.981,PayPal,4516 Old Seward H...,Anchorage,Alaska,99503,group2
2022-04-04,trx-666714316389,85519,Ava Williams,ava.williams@yaho...,16,Skirt,Clothing,4,39.99,0.05,151.962,Credit Card,4444 Central Avenue,Fremont,California,94536,group2
2023-11-05,trx-388436866471,85510,Alexander Brown,alexander.brown@y...,16,Skirt,Clothing,5,39.99,0.0,199.95,Credit Card,185 Woodlawn Road,Burlington,Vermont,5408,group2
2023-02-25,trx-109095192203,85514,Sophia Johnson,sophia.johnson@gm...,12,Monitor,Electronics,10,299.99,0.1,2699.91,Stripe,53 Greenwood Avenue,Wakefield,Massachusetts,1880,group2
2022-09-16,trx-642436826706,85500,Sophia Miller,sophia.miller@hot...,23,Toaster,Appliances,6,39.99,0.15,203.949,Credit Card,140 South Hill Av...,Fayetteville,Arkansas,72701,group2
2023-01-16,trx-884788991461,85469,Michael Williams,michael.williams@...,15,Hoodie,Clothing,6,29.99,0.0,179.94,Stripe,3784 Milky Way Drive,Anchorage,Alaska,99517,group2
2023-07-03,trx-903740554657,85503,Olivia Johnson,olivia.johnson@ou...,10,Sandals,Footwear,5,39.99,0.0,199.95,Stripe,109 High Street,Manchester,Connecticut,6040,group2


In [174]:
df_splits['offline'] = dq_layer.apply_offline_transformations(df_splits['offline'])

In [175]:
df_splits['offline'].select(Schemas.offline_transactions.fieldNames())

transaction_date,transaction_id,customer_id,customer_name,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,discount,total_price,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group
2023-04-18,trx-301997428444,85480,Olivia Davis,olivia.davis@hotm...,6,4,25,Washing Machine,Appliances,8,499.99,0.15,3399.932,Credit Card,Emma Taylor,2019-03-28,Houston,2016-11-05,D,group2
2023-10-24,trx-661774073579,85550,Emma Wilson,emma.wilson@outlo...,6,4,2,Smartphone,Electronics,4,699.99,0.0,2799.96,Credit Card,Emma Taylor,2019-03-28,Houston,2016-11-05,D,group2
2023-08-15,trx-182895486748,85559,Alexander Moore,alexander.moore@h...,5,2,10,Sandals,Footwear,2,39.99,0.2,63.984,Credit Card,David Wilson,2021-04-08,Los Angeles,2016-07-28,B,group2
2023-01-10,trx-799242150112,85522,Sophia Williams,sophia.williams@o...,8,5,7,Dress,Clothing,4,59.99,0.1,215.964,Credit Card,Olivia Davis,2021-10-24,Phoenix,2017-09-20,C,group2
2023-03-22,trx-707989318319,85518,Ava Taylor,ava.taylor@gmail.com,4,5,8,Sneakers,Footwear,8,79.99,0.0,639.92,Credit Card,Emily Brown,2020-10-25,Phoenix,2017-09-20,C,group2
2023-07-19,trx-098609734580,85553,Mia Williams,mia.williams@hotm...,1,5,15,Hoodie,Clothing,1,29.99,0.05,28.49,Credit Card,John Doe,2020-06-03,Phoenix,2017-09-20,C,group2
2023-06-18,trx-620712256942,85487,Olivia Smith,olivia.smith@hotm...,2,5,9,Boots,Footwear,8,129.99,0.05,987.924,Cash,Jane Smith,2018-05-13,Phoenix,2017-09-20,C,group2
2022-12-12,trx-997075080204,85484,Olivia Brown,olivia.brown@hotm...,5,3,3,Tablet,Electronics,1,299.99,0.0,299.99,Credit Card,David Wilson,2021-04-08,Chicago,2015-03-10,A,group2
2022-07-11,trx-776170009574,85542,Ava Davis,ava.davis@yahoo.com,1,4,10,Sandals,Footwear,6,39.99,0.0,239.94,Cash,John Doe,2020-06-03,Houston,2016-11-05,D,group2
2022-06-28,trx-176960504329,85526,Sophia Smith,sophia.smith@hotm...,7,2,1,Laptop,Electronics,1,999.99,0.2,799.992,Cash,Christopher Miller,2020-01-11,Los Angeles,2016-07-28,B,group2


In [207]:
dq_layer = DataQualityLayer(spark)

In [208]:
transformed_df = dq_layer.apply_common_transformations(raw_df)

In [209]:
transformed_df.show(3, False, True)

-RECORD 0---------------------------------------------------------
 transaction_date      | 2022-2-26                                
 transaction_id        | trx-100226527334                         
 customer_id           | 85523                                    
 customer_email        | michael.smith@yahoo.com                  
 sales_agent_id        | null                                     
 branch_id             | null                                     
 product_id            | 9                                        
 product_name          | Boots                                    
 product_category      | Footwear                                 
 units                 | 4                                        
 unit_price            | 129.99                                   
 is_online             | yes                                      
 payment_method        | PayPal                                   
 shipping_address      | 6717 North 59th Avenue/Glendale/AZ/85

In [96]:
transformed_df.printSchema()

root
 |-- transaction_date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- sales_agent_id: string (nullable = true)
 |-- branch_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: string (nullable = true)
 |-- unit_price: string (nullable = true)
 |-- is_online: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_address: string (nullable = true)
 |-- sales_agent_name: string (nullable = true)
 |-- sales_agent_hire_date: string (nullable = true)
 |-- branch_location: string (nullable = true)
 |-- branch_establish_date: string (nullable = true)
 |-- branch_class: string (nullable = true)
 |-- group: string (nullable = true)
 |-- discount: float (nullable = false)
 |-- customer_name: string (nullable = true)



In [219]:
split_dfs = dq_layer.split_online_offline(transformed_df)

In [220]:
split_dfs['online'].show(2, False, True)

-RECORD 0-----------------------------------------------------------
 transaction_date      | 2022-12-10                                 
 transaction_id        | trx-514265914599                           
 customer_id           | 85536                                      
 customer_email        | sophia.jones@hotmail.com                   
 sales_agent_id        | null                                       
 branch_id             | null                                       
 product_id            | 23                                         
 product_name          | Toaster                                    
 product_category      | Appliances                                 
 units                 | 2                                          
 unit_price            | 39.99                                      
 payment_method        | PayPal                                     
 shipping_address      | 435 Benita Drive/Nashville/TN/37211        
 sales_agent_name      | null     

In [212]:
split_dfs['offline'].show(2, False, True)

-RECORD 0--------------------------------------
 transaction_date      | 2023-10-14            
 transaction_id        | trx-818345757550      
 customer_id           | 85467                 
 customer_email        | emma.miller@yahoo.com 
 sales_agent_id        | 10.0                  
 branch_id             | 2.0                   
 product_id            | 14                    
 product_name          | Camera                
 product_category      | Electronics           
 units                 | 5                     
 unit_price            | 399.99                
 payment_method        | Credit Card           
 shipping_address      | null                  
 sales_agent_name      | Sophia Moore          
 sales_agent_hire_date | 2019-5-25             
 branch_location       | Los Angeles           
 branch_establish_date | 2016-07-28            
 branch_class          | B                     
 group                 | group2                
 discount              | 0.2            

In [80]:
try:
    first_group = split_dfs['online'].select("group").filter("group is not null").first()[0]
    result_string = str(first_group)
    print(result_string)
except Exception as e:
    print(f"An error occurred: {str(e)}")
    result_string = None

group6


In [221]:
for df_type, df in split_dfs.items():
        if df_type == "offline":
            df = dq_layer.apply_offline_transformations(df)
            df = df.select(Schemas.offline_transactions.fieldNames())
        else:  # online
            df = dq_layer.apply_online_transformations(df)
            df = df.select(Schemas.online_transactions.fieldNames())
        
        
        DataWriter.write_parquet(spark, df, Config.STANDARDIZED_BASE_PATH, df_type, ["transaction_date"])
        

Created directory: /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10
Written online transactions for group group2 to /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10/online_transactions_group2_20240710214406
Directory already exists: /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10
Written offline transactions for group group2 to /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10/offline_transactions_group2_20240710214459


In [214]:
split_dfs['offline'].show(2, False, True)

-RECORD 0--------------------------------------
 transaction_date      | 2023-10-14            
 transaction_id        | trx-818345757550      
 customer_id           | 85467                 
 customer_email        | emma.miller@yahoo.com 
 sales_agent_id        | 10.0                  
 branch_id             | 2.0                   
 product_id            | 14                    
 product_name          | Camera                
 product_category      | Electronics           
 units                 | 5                     
 unit_price            | 399.99                
 payment_method        | Credit Card           
 shipping_address      | null                  
 sales_agent_name      | Sophia Moore          
 sales_agent_hire_date | 2019-5-25             
 branch_location       | Los Angeles           
 branch_establish_date | 2016-07-28            
 branch_class          | B                     
 group                 | group2                
 discount              | 0.2            

In [215]:
split_dfs['online'].show(2, False, True)

-RECORD 0-----------------------------------------------------------
 transaction_date      | 2022-12-10                                 
 transaction_id        | trx-514265914599                           
 customer_id           | 85536                                      
 customer_email        | sophia.jones@hotmail.com                   
 sales_agent_id        | null                                       
 branch_id             | null                                       
 product_id            | 23                                         
 product_name          | Toaster                                    
 product_category      | Appliances                                 
 units                 | 2                                          
 unit_price            | 39.99                                      
 payment_method        | PayPal                                     
 shipping_address      | 435 Benita Drive/Nashville/TN/37211        
 sales_agent_name      | null     

In [222]:
spark.stop()