In [18]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from datetime import datetime

# Configuration
class Config:
    RAW_BASE_PATH = "/user/itversity/q-company_raw_layer"
    STANDARDIZED_BASE_PATH = "/user/itversity/q-company_standardized_layer"
    
class Schemas:
    offline_transactions = StructType([
        StructField("transaction_date", DateType(), nullable=False),
        StructField("transaction_id", StringType(), nullable=False),
        StructField("customer_id", LongType(), nullable=False),
        StructField("customer_name", StringType(), nullable=False),
        StructField("customer_email", StringType(), nullable=False),
        StructField("sales_agent_id", LongType(), nullable=False),
        StructField("branch_id", LongType(), nullable=False),
        StructField("product_id", LongType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("product_category", StringType(), nullable=False),
        StructField("units", IntegerType(), nullable=False),
        StructField("unit_price", DoubleType(), nullable=False),
        StructField("discount", FloatType(), nullable=False),
        StructField("payment_method", StringType(), nullable=False),
        StructField("sales_agent_name", StringType(), nullable=False),
        StructField("sales_agent_hire_date", DateType(), nullable=False),
        StructField("branch_location", StringType(), nullable=False),
        StructField("branch_establish_date", DateType(), nullable=False),
        StructField("branch_class", StringType(), nullable=False),
        StructField("group", StringType(), nullable=False)
    ])

    online_transactions = StructType([
        StructField("transaction_date", DateType(), nullable=False),
        StructField("transaction_id", StringType(), nullable=False),
        StructField("customer_id", LongType(), nullable=False),
        StructField("customer_name", StringType(), nullable=False),
        StructField("customer_email", StringType(), nullable=False),
        StructField("product_id", LongType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("product_category", StringType(), nullable=False),
        StructField("units", IntegerType(), nullable=False),
        StructField("unit_price", DoubleType(), nullable=False),
        StructField("discount", FloatType(), nullable=False),
        StructField("payment_method", StringType(), nullable=False),
        StructField("shipping_street_name",  StringType(), nullable=False), 
        StructField("shipping_city",  StringType(), nullable=False),
        StructField("shipping_state",  StringType(), nullable=False),
        StructField("shipping_zip_code",  StringType(), nullable=False),
        StructField("group", StringType(), nullable=False)
    ])
    
# Utility functions
class HDFSUtils:
    @staticmethod
    def get_latest_file(spark: SparkSession, hdfs_path: str) -> str:
        files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
            .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
        sorted_files = sorted(files, key=lambda f: f.getModificationTime(), reverse=True)
        return sorted_files[0].getPath().toString() if sorted_files else None
    

In [19]:
from pyspark.sql import SparkSession, DataFrame
from typing import Dict, List, Callable
from datetime import datetime
from utils import HDFSUtils
import os

# Data reading
class DataReader:
    @staticmethod
    def read_latest_parquet(spark: SparkSession, base_path: str, num_partitions: int = 200) -> DataFrame:
        current_date = datetime.now().strftime("%Y-%m-%d")
        hdfs_path = f"{base_path}/raw_sales_transactions_{current_date}"
        latest_file = HDFSUtils.get_latest_file(spark, hdfs_path)
        max_retries = 5
        initial_wait_time = 5
        
        if latest_file:
            print(f"Processing file: {latest_file}")
            for attempt in range(max_retries):
                try:
                    # Read the Parquet file and repartition
                    df = spark.read.option("mergeSchema", "true").parquet(latest_file)
                    repartitioned_df = df.repartition(num_partitions)
                    print(f"Successfully read and repartitioned to {num_partitions} partitions")
                    return repartitioned_df
                except Exception as e:
                    wait_time = initial_wait_time * (2 ** attempt)
                    print(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    print("Full stack trace:")
                    traceback.print_exc()
                    if attempt + 1 < max_retries:
                        print(f"Retrying in {wait_time} seconds...")
                        time.sleep(wait_time)
                    else:
                        print("Max retries reached. Could not read the parquet file.")
                        return None
        else:
            print(f"No files found in {hdfs_path}")
            return None

# Data Writer
class DataWriter:
    @staticmethod
    def write_parquet(spark: SparkSession, df: DataFrame, base_path: str, transaction_type: str, partition_cols: List[str]) -> None:
        current_date = datetime.now().strftime("%Y-%m-%d")
        standardized_dir = f"standardized_sales_transaction_{current_date}"
        full_path = os.path.join(base_path, standardized_dir)

        # Check if the standardized directory for the current day exists, create it if not
        fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
        if not fs.exists(spark._jvm.org.apache.hadoop.fs.Path(full_path)):
            fs.mkdirs(spark._jvm.org.apache.hadoop.fs.Path(full_path))
            print(f"Created directory: {full_path}")
        else:
            print(f"Directory already exists: {full_path}")

        # Generate timestamp
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

        group_number = df.select("group").distinct().collect()
            
        file_name = f"{transaction_type}_transactions_{group_number}_{timestamp}"
        group_path = os.path.join(full_path, file_name)

        df.write \
            .partitionBy(partition_cols) \
            .mode("overwrite") \
            .parquet(group_path)

        print(f"Written {transaction_type} transactions for group {group_number} to {group_path}")

In [3]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import *
from typing import Dict, List, Callable
from datetime import datetime
import re

# Data transformation functions
class DataTransformer:
    @staticmethod
    def rename_columns(df: DataFrame) -> DataFrame:
        return df.withColumnRenamed("name", "sales_agent_name") \
                 .withColumnRenamed("hire_date", "sales_agent_hire_date") \
                 .withColumnRenamed("location", "branch_location") \
                 .withColumnRenamed("establish_date", "branch_establish_date") \
                 .withColumnRenamed("class", "branch_class") \
                 .withColumnRenamed("cusomter_email", "customer_email") \
                 .withColumnRenamed("cusomter_lname", "customer_lname")

    @staticmethod
    def remove_blank_columns(df: DataFrame) -> DataFrame:
        return df.select([c for c in df.columns if df.filter(col(c).isNotNull()).count() > 0])

    @staticmethod
    def map_offers_to_discount(spark: SparkSession, df: DataFrame, offers_dict: Dict[str, float]) -> DataFrame:
        broadcast_offers = spark.sparkContext.broadcast(offers_dict)
        offer_columns = ["offer_1", "offer_2", "offer_3", "offer_4", "offer_5"]
        
        def create_offer_column(offer_col: str):
            return when(col(offer_col) == lit(True), lit(broadcast_offers.value[offer_col]))
        
        offer_discount_columns = [create_offer_column(offer_col).alias(f"{offer_col}_discount") for offer_col in offer_columns]
        df_with_offer_discounts = df.select("*", *offer_discount_columns)
        discount_column = coalesce(*[col(f"{offer_col}_discount") for offer_col in offer_columns], lit(broadcast_offers.value["null"]))
        
        return df_with_offer_discounts.withColumn("discount", discount_column.cast(FloatType())) \
                                      .drop(*[f"{offer_col}_discount" for offer_col in offer_columns]) \
                                      .drop(*offer_columns)

    @staticmethod
    def merge_customer_name(df: DataFrame) -> DataFrame:
        return df.withColumn("customer_name", concat(col("customer_fname"), lit(" "), col("customer_lname"))) \
                 .drop("customer_fname", "customer_lname")

    @staticmethod
    @udf(returnType=StringType())
    def clean_email(email: str) -> str:
        if email is None:
            return None
        email = email.strip()
        com = email.rfind('.')
        email = email[:com+1] + "com"
        email = re.sub(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}).*', r'\1', email)
        return email if re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email) else None

    @staticmethod
    @udf(returnType=StringType())
    def validate_transaction_id(trx_id: str) -> str:
        if trx_id is None:
            return None
        trx_id = trx_id.strip()
        numeric_part = re.sub(r'\D', '', trx_id)
        return f"trx-{numeric_part}" if numeric_part else None

    @staticmethod
    @udf(returnType=DoubleType())
    def validate_unit_price(price):
        return price if price >= 0 else (-1 * price)
    
    @staticmethod    
    def rearrange_columns(df: DataFrame) -> DataFrame:
        new_order = [
            'transaction_id', 'transaction_date', 'customer_id', 'customer_name', 'customer_email',
            'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
            'payment_method', 'group', 'is_online', 'sales_agent_id', 'sales_agent_name',
            'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
            'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
        ]
        return df.select(new_order)

    @staticmethod
    def convert_dates_to_date_type(df: DataFrame) -> DataFrame:
        return df.withColumn('transaction_date', to_date(col('transaction_date'))) \
                 .withColumn('branch_establish_date', to_date(col('branch_establish_date'))) \
                 .withColumn('sales_agent_hire_date', to_date(col('sales_agent_hire_date')))

    @staticmethod
    def convert_ids_to_long_type(df: DataFrame) -> DataFrame:
        return df.withColumn('sales_agent_id', col('sales_agent_id').cast(LongType())) \
                 .withColumn('branch_id', col('branch_id').cast(LongType()))

    @staticmethod
    def split_shipping_address(df: DataFrame) -> DataFrame:
        return df.withColumn("shipping_address_split", split(col("shipping_address"), "/")) \
                 .withColumn("shipping_street_name", col("shipping_address_split")[0]) \
                 .withColumn("shipping_city", col("shipping_address_split")[1]) \
                 .withColumn("shipping_state", col("shipping_address_split")[2]) \
                 .withColumn("shipping_zip_code", col("shipping_address_split")[3]) \
                 .drop("shipping_address", "shipping_address_split")

    @staticmethod
    def map_shipping_state(spark: SparkSession, df: DataFrame, state_dict: Dict[str, str]) -> DataFrame:
        broadcast_dict = spark.sparkContext.broadcast(state_dict)
        conditions = coalesce(*[when(col("shipping_state") == key, lit(value)) for key, value in broadcast_dict.value.items()])
        return df.withColumn("shipping_state_mapped", when(conditions.isNotNull(), conditions).otherwise(col("shipping_state"))) \
                 .drop("shipping_state") \
                 .withColumnRenamed("shipping_state_mapped", "shipping_state")

In [4]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import *
from typing import Dict, List, Callable
from datetime import datetime
from transformation import DataTransformer
import re

# Data Quality Layer
class DataQualityLayer:
    def __init__(self, spark: SparkSession):
        self.spark = spark
        self.state_dict = {
            'AZ': 'Arizona', 'DC': 'District of Columbia', 'KY': 'Kentucky',
            'CA': 'California', 'CT': 'Connecticut', 'VT': 'Vermont',
            'MD': 'Maryland', 'AL': 'Alabama', 'TN': 'Tennessee',
            'GA': 'Georgia', 'MA': 'Massachusetts', 'FL': 'Florida',
            'CO': 'Colorado', 'AK': 'Alaska', 'AR': 'Arkansas',
            'OK': 'Oklahoma', 'Washington': 'Washington'
        }
        self.offers_dict = {
            "null": 0.0, "offer_1": 0.05, "offer_2": 0.1,
            "offer_3": 0.15, "offer_4": 0.20, "offer_5": 0.25
        }

    def split_online_offline(self, df: DataFrame) -> Dict[str, DataFrame]:
        return {
            "online": df.filter(col("is_online") == "yes").drop("is_online"),
            "offline": df.filter(col("is_online") == "no").drop("is_online")
        }

    def apply_common_transformations(self, df: DataFrame) -> DataFrame:
        transformations = [
            DataTransformer.rename_columns,
            DataTransformer.remove_blank_columns,
            lambda df: DataTransformer.map_offers_to_discount(self.spark, df, self.offers_dict),
            DataTransformer.merge_customer_name,
            lambda df: df.withColumn("customer_email", DataTransformer.clean_email(col("customer_email"))),
            lambda df: df.withColumn("transaction_id", DataTransformer.validate_transaction_id(col("transaction_id")))
        ]

        for transform in transformations:
            df = transform(df)
        return df

    def apply_offline_transformations(self, df: DataFrame) -> DataFrame:
        transformations = [
            DataTransformer.convert_dates_to_date_type,
            DataTransformer.convert_ids_to_long_type,
        ]

        for transform in transformations:
            df = transform(df)
        return df

    def apply_online_transformations(self, df: DataFrame) -> DataFrame:
        transformations = [
            lambda df: df.withColumn('transaction_date', to_date(col('transaction_date'))),
            DataTransformer.split_shipping_address,
            lambda df: DataTransformer.map_shipping_state(self.spark, df, self.state_dict),
        ]

        for transform in transformations:
            df = transform(df)
        return df

    def add_row_index(self, df: DataFrame) -> DataFrame:
        return df.withColumn("row_index", monotonically_increasing_id())

In [5]:
spark = SparkSession.builder.appName("DataQualityLayer").getOrCreate()

In [20]:
raw_df = DataReader.read_latest_parquet(spark, Config.RAW_BASE_PATH, num_partitions=10)

Processing file: hdfs://localhost:9000/user/itversity/q-company_raw_layer/raw_sales_transactions_2024-07-10/0dfc5c25529b4dcaa75b14ffb6030706-0.parquet
Attempt 1 failed with error: An error occurred while calling o177.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 11, itvdelab, executor 2): org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:226)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:290)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:538)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:613)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:605)
	at org.a

NameError: name 'traceback' is not defined

In [226]:
raw_df.show(2, False, True)

-RECORD 0-----------------------------------------
 transaction_date | 2023-11-1                     
 transaction_id   | trx-570590551801              
 customer_id      | 85482                         
 customer_fname   | William                       
 cusomter_lname   | Miller                        
 cusomter_email   | william.miller@hotmail.com$u  
 sales_agent_id   | 9.0                           
 branch_id        | 2.0                           
 product_id       | 17                            
 product_name     | Blouse                        
 product_category | Clothing                      
 offer_1          | null                          
 offer_2          | null                          
 offer_3          | true                          
 offer_4          | null                          
 offer_5          | null                          
 units            | 3                             
 unit_price       | 29.99                         
 is_online        | no         

In [170]:
df = DataTransformer.rename_columns(raw_df)

In [171]:
df = DataTransformer.remove_blank_columns(df)

transaction_date,transaction_id,customer_id,customer_fname,customer_lname,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,offer_1,offer_2,offer_3,offer_4,offer_5,units,unit_price,is_online,payment_method,shipping_address,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group
2023-11-1,trx-570590551801,85482,William,Miller,william.miller@ho...,9.0,2.0,17,Blouse,Clothing,,,True,,,3,29.99,no,Cash,,Daniel Martinez,2021-8-26,Los Angeles,2016-07-28,B,group4
2022-5-19,trx-847915039036,85512,Alexander,Moore,alexander.moore@o...,5.0,3.0,17,Blouse,Clothing,,,,True,,9,29.99,no,Cash,,David Wilson,2019-3-19,Chicago,2015-03-10,A,group4
2022-12-10,trx-784537720750,85495,Alexander,Wilson,alexander.wilson@...,2.0,3.0,28,Hair Dryer,Appliances,,,,True,,1,19.99,no,Credit Card,,Jane Smith,2018-12-19,Chicago,2015-03-10,A,group4
2023-12-7,trx-182507896138,85556,William,Miller,william.miller@ya...,,,17,Blouse,Clothing,,,,,True,10,29.99,yes,Credit Card,800 Old Stage Roa...,,,,,,group4
2022-1-13,trx-787890698807,85466,William,Johnson,william.johnson@g...,4.0,5.0,19,Sandals,Footwear,,True,,,,4,29.99,no,Cash,,Emily Brown,2020-7-25,Phoenix,2017-09-20,C,group4
2023-7-20,trx-002574051143,85535,Olivia,Smith,olivia.smith@gmai...,,,4,Headphones,Electronics,,,,,,8,99.99,yes,PayPal,12 Fletcher Lane/...,,,,,,group4
2023-11-16,trx-950189370720,85516,Ava,Davis,ava.davis@hotmail...,10.0,1.0,3,Tablet,Electronics,,,,,,3,299.99,no,Credit Card,,Sophia Moore,2019-1-4,New York,2017-01-15,A,group4
2022-8-15,trx-234964324036,85533,Alexander,Davis,alexander.davis@g...,2.0,3.0,29,Hair Straightener,Appliances,True,,,,,3,39.99,no,Credit Card,,Jane Smith,2018-12-19,Chicago,2015-03-10,A,group4
2022-1-15,trx-862795076664,85538,James,Miller,james.miller@yaho...,1.0,2.0,15,Hoodie,Clothing,,True,,,,5,29.99,no,Credit Card,,John Doe,2020-9-10,Los Angeles,2016-07-28,B,group4
2022-1-18,trx-402069615475,85467,Emma,Williams,emma.williams@gma...,2.0,3.0,13,Printer,Electronics,True,,,,,6,149.99,no,Credit Card,,Jane Smith,2018-12-19,Chicago,2015-03-10,A,group4


In [174]:
df = DataTransformer.map_offers_to_discount(spark, df, dq_layer.offers_dict)

In [176]:
df = DataTransformer.merge_customer_name(df)

In [178]:
df = df.withColumn("customer_email", DataTransformer.clean_email(col("customer_email")))

In [181]:
df = df.withColumn("transaction_id", DataTransformer.validate_transaction_id(col("transaction_id")))

In [188]:
df = df.withColumn("unit_price", DataTransformer.validate_unit_price(col("unit_price")))

In [199]:
df_splits = dq_layer.split_online_offline(df)

In [193]:
DataTransformer.convert_dates_to_date_type(df_splits['online'])

transaction_date,transaction_id,customer_id,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,payment_method,shipping_address,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,discount,customer_name
2023-01-01,trx-332766751554,85469,john.taylor@yahoo...,,,17,Blouse,Clothing,8,29.99,Credit Card,1983 Reidsville S...,,,,,,group4,0.1,John Taylor
2022-06-06,trx-602377989729,85517,michael.smith@yah...,,,18,Boots,Footwear,5,149.99,Stripe,3521 Stargate Cir...,,,,,,group4,0.0,Michael Smith
2022-06-01,trx-602271810779,85555,james.miller@gmai...,,,28,Hair Dryer,Appliances,9,19.99,PayPal,82 Queen Court/Ma...,,,,,,group4,0.05,James Miller
2023-06-27,trx-307961901975,85540,emma.williams@gma...,,,2,Smartphone,Electronics,8,699.99,Stripe,3203 US Highway 9...,,,,,,group4,0.1,Emma Williams
2022-10-15,trx-591458426082,85522,alexander.william...,,,29,Hair Straightener,Appliances,5,39.99,PayPal,707 Leaning Oaks ...,,,,,,group4,0.1,Alexander Williams
2022-08-12,trx-466005084279,85467,emma.williams@gma...,,,18,Boots,Footwear,2,149.99,Credit Card,9306 Norton Commo...,,,,,,group4,0.15,Emma Williams
2022-06-26,trx-097381148420,85492,john.davis@hotmai...,,,15,Hoodie,Clothing,9,29.99,Credit Card,105 West Wareingw...,,,,,,group4,0.25,John Davis
2023-02-23,trx-686849132671,85533,alexander.davis@g...,,,27,Iron,Appliances,2,29.99,Credit Card,1523 South 9th St...,,,,,,group4,0.15,Alexander Davis
2022-02-13,trx-946382434175,85549,michael.smith@hot...,,,22,Coffee Maker,Appliances,4,79.99,Credit Card,3528 Seasons Driv...,,,,,,group4,0.0,Michael Smith
2023-05-17,trx-995839688738,85544,alexander.smith@y...,,,13,Printer,Electronics,8,149.99,Credit Card,67 Steeplechase D...,,,,,,group4,0.05,Alexander Smith


In [194]:
DataTransformer.convert_ids_to_long_type(df_splits['online'])

transaction_date,transaction_id,customer_id,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,payment_method,shipping_address,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,discount,customer_name
2023-1-1,trx-332766751554,85469,john.taylor@yahoo...,,,17,Blouse,Clothing,8,29.99,Credit Card,1983 Reidsville S...,,,,,,group4,0.1,John Taylor
2022-6-6,trx-602377989729,85517,michael.smith@yah...,,,18,Boots,Footwear,5,149.99,Stripe,3521 Stargate Cir...,,,,,,group4,0.0,Michael Smith
2022-6-1,trx-602271810779,85555,james.miller@gmai...,,,28,Hair Dryer,Appliances,9,19.99,PayPal,82 Queen Court/Ma...,,,,,,group4,0.05,James Miller
2023-6-27,trx-307961901975,85540,emma.williams@gma...,,,2,Smartphone,Electronics,8,699.99,Stripe,3203 US Highway 9...,,,,,,group4,0.1,Emma Williams
2022-10-15,trx-591458426082,85522,alexander.william...,,,29,Hair Straightener,Appliances,5,39.99,PayPal,707 Leaning Oaks ...,,,,,,group4,0.1,Alexander Williams
2022-8-12,trx-466005084279,85467,emma.williams@gma...,,,18,Boots,Footwear,2,149.99,Credit Card,9306 Norton Commo...,,,,,,group4,0.15,Emma Williams
2022-6-26,trx-097381148420,85492,john.davis@hotmai...,,,15,Hoodie,Clothing,9,29.99,Credit Card,105 West Wareingw...,,,,,,group4,0.25,John Davis
2023-2-23,trx-686849132671,85533,alexander.davis@g...,,,27,Iron,Appliances,2,29.99,Credit Card,1523 South 9th St...,,,,,,group4,0.15,Alexander Davis
2022-2-13,trx-946382434175,85549,michael.smith@hot...,,,22,Coffee Maker,Appliances,4,79.99,Credit Card,3528 Seasons Driv...,,,,,,group4,0.0,Michael Smith
2023-5-17,trx-995839688738,85544,alexander.smith@y...,,,13,Printer,Electronics,8,149.99,Credit Card,67 Steeplechase D...,,,,,,group4,0.05,Alexander Smith


In [200]:
df_splits['online'] = dq_layer.apply_online_transformations(df_splits['online'])

In [201]:
df_splits['online'].printSchema()

root
 |-- transaction_date: date (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- sales_agent_id: double (nullable = true)
 |-- branch_id: double (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- sales_agent_name: string (nullable = true)
 |-- sales_agent_hire_date: string (nullable = true)
 |-- branch_location: string (nullable = true)
 |-- branch_establish_date: string (nullable = true)
 |-- branch_class: string (nullable = true)
 |-- group: string (nullable = true)
 |-- discount: float (nullable = false)
 |-- customer_name: string (nullable = true)
 |-- shipping_street_name: string (nullable = true)
 |-- shipping_city: string (nullable = true)
 |-- shipp

In [202]:
df_splits['online'].show(2, False, True)

-RECORD 0-----------------------------------------
 transaction_date      | 2023-01-01               
 transaction_id        | trx-332766751554         
 customer_id           | 85469                    
 customer_email        | john.taylor@yahoo..com   
 sales_agent_id        | null                     
 branch_id             | null                     
 product_id            | 17                       
 product_name          | Blouse                   
 product_category      | Clothing                 
 units                 | 8                        
 unit_price            | 29.99                    
 payment_method        | Credit Card              
 sales_agent_name      | null                     
 sales_agent_hire_date | null                     
 branch_location       | null                     
 branch_establish_date | null                     
 branch_class          | null                     
 group                 | group4                   
 discount              | 0.1   

In [203]:
df_splits['online'].select(Schemas.online_transactions.fieldNames())

transaction_date,transaction_id,customer_id,customer_name,customer_email,product_id,product_name,product_category,units,unit_price,discount,payment_method,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,group
2023-03-06,trx-252944566828,85488,James Wilson,james.wilson@outl...,12,Monitor,Electronics,7,299.99,0.05,PayPal,8502 Madrone Avenue,Louisville,Kentucky,40258,group4
2022-05-27,trx-083113101686,85544,Alexander Smith,alexander.smith@y...,12,Monitor,Electronics,2,299.99,0.0,Credit Card,8 Richardson Road,Barre,Vermont,5641,group4
2023-02-25,trx-034911454756,85508,Emma Wilson,emma.wilson@hotma...,13,Printer,Electronics,9,149.99,0.15,Credit Card,4714 Narrow Lane ...,Montgomery,Alabama,36116,group4
2023-05-21,trx-905427223281,85506,Olivia Williams,olivia.williams@h...,29,Hair Straightener,Appliances,5,39.99,0.0,Credit Card,6 Little Country ...,Savannah,Georgia,31406,group4
2022-04-02,trx-572917591034,85525,Emma Smith,emma.smith@gmail....,26,Vacuum Cleaner,Appliances,6,199.99,0.1,Credit Card,102 East Cydnee S...,Fayetteville,Arkansas,72703,group4
2022-08-23,trx-192158591006,85519,Alexander Taylor,alexander.taylor@...,22,Coffee Maker,Appliances,2,79.99,0.25,PayPal,10151 West 64th A...,Arvada,Colorado,80004,group4
2023-10-16,trx-686233258918,85488,James Wilson,james.wilson@outl...,20,Heels,Footwear,3,59.99,0.15,Stripe,115 Falkirk Street,Savannah,Georgia,31407,group4
2023-02-10,trx-049918901166,85487,James Miller,james.miller@gmai...,24,Blender,Appliances,2,49.99,0.2,PayPal,394 Hilltop Lane,Annapolis,Maryland,21403,group4
2022-01-07,trx-185242020527,85538,James Miller,james.miller@yaho...,10,Sandals,Footwear,10,39.99,0.15,PayPal,3541 North Road,Maidstone,Vermont,5905,group4
2023-07-04,trx-336890800434,85560,Michael Taylor,michael.taylor@ya...,7,Dress,Clothing,7,59.99,0.1,Stripe,275 Ridge Lane,Waltham,Massachusetts,2452,group4


In [206]:
df_splits['offline'] = dq_layer.apply_offline_transformations(df_splits['offline'])

In [207]:
df_splits['offline'].select(Schemas.offline_transactions.fieldNames())

transaction_date,transaction_id,customer_id,customer_name,customer_email,sales_agent_id,branch_id,product_id,product_name,product_category,units,unit_price,discount,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group
2023-01-01,trx-831197918842,85462,Emma Williams,emma.williams@hot...,10,2,7,Dress,Clothing,7,59.99,0.0,Cash,Sophia Moore,2019-01-04,Los Angeles,2016-07-28,B,group4
2022-10-28,trx-007466656258,85549,Michael Smith,michael.smith@hot...,5,2,7,Dress,Clothing,1,59.99,0.0,Cash,David Wilson,2019-03-19,Los Angeles,2016-07-28,B,group4
2023-12-18,trx-250889464286,85534,Mia Williams,mia.williams@hotm...,8,5,30,Electric Kettle,Appliances,10,24.99,0.0,Credit Card,Olivia Davis,2018-08-26,Phoenix,2017-09-20,C,group4
2022-03-19,trx-350471067618,85499,Ava Taylor,ava.taylor@outloo...,5,2,15,Hoodie,Clothing,10,29.99,0.2,Cash,David Wilson,2019-03-19,Los Angeles,2016-07-28,B,group4
2022-10-01,trx-645747300738,85525,Emma Smith,emma.smith@gmail....,7,1,1,Laptop,Electronics,10,999.99,0.0,Cash,Christopher Miller,2020-09-26,New York,2017-01-15,A,group4
2023-10-07,trx-913476359633,85506,Olivia Williams,olivia.williams@h...,3,1,1,Laptop,Electronics,3,999.99,0.0,Cash,Michael Johnson,2019-04-08,New York,2017-01-15,A,group4
2023-04-17,trx-914186159400,85555,James Miller,james.miller@gmai...,7,5,6,Jeans,Clothing,10,49.99,0.0,Credit Card,Christopher Miller,2020-09-26,Phoenix,2017-09-20,C,group4
2022-10-04,trx-214389519326,85499,Ava Taylor,ava.taylor@outloo...,3,3,14,Camera,Electronics,6,399.99,0.0,Cash,Michael Johnson,2019-04-08,Chicago,2015-03-10,A,group4
2022-08-24,trx-061213947441,85517,Michael Smith,michael.smith@yah...,3,4,27,Iron,Appliances,1,29.99,0.0,Credit Card,Michael Johnson,2019-04-08,Houston,2016-11-05,D,group4
2022-12-07,trx-767700910919,85531,Olivia Williams,olivia.williams@h...,6,3,19,Sandals,Footwear,4,29.99,0.0,Credit Card,Emma Taylor,2019-01-15,Chicago,2015-03-10,A,group4


In [227]:
dq_layer = DataQualityLayer(spark)

In [228]:
transformed_df = dq_layer.apply_common_transformations(raw_df)

In [229]:
transformed_df.show(3, False, True)

-RECORD 0----------------------------------------------
 transaction_date      | 2023-11-1                     
 transaction_id        | trx-570590551801              
 customer_id           | 85482                         
 customer_email        | william.miller@hotmail..com   
 sales_agent_id        | 9.0                           
 branch_id             | 2.0                           
 product_id            | 17                            
 product_name          | Blouse                        
 product_category      | Clothing                      
 units                 | 3                             
 unit_price            | 29.99                         
 is_online             | no                            
 payment_method        | Cash                          
 shipping_address      | null                          
 sales_agent_name      | Daniel Martinez               
 sales_agent_hire_date | 2021-8-26                     
 branch_location       | Los Angeles            

In [145]:
transformed_df.printSchema()

root
 |-- transaction_date: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- sales_agent_id: double (nullable = true)
 |-- branch_id: double (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- is_online: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_address: string (nullable = true)
 |-- sales_agent_name: string (nullable = true)
 |-- sales_agent_hire_date: string (nullable = true)
 |-- branch_location: string (nullable = true)
 |-- branch_establish_date: string (nullable = true)
 |-- branch_class: string (nullable = true)
 |-- group: string (nullable = true)
 |-- discount: float (nullable = false)
 |-- customer_name: string (nullable = true)



In [230]:
split_dfs = dq_layer.split_online_offline(transformed_df)

In [231]:
split_dfs['online'].show(2, False, True)

-RECORD 0----------------------------------------------------------
 transaction_date      | 2023-1-1                                  
 transaction_id        | trx-332766751554                          
 customer_id           | 85469                                     
 customer_email        | john.taylor@yahoo..com                    
 sales_agent_id        | null                                      
 branch_id             | null                                      
 product_id            | 17                                        
 product_name          | Blouse                                    
 product_category      | Clothing                                  
 units                 | 8                                         
 unit_price            | 29.99                                     
 payment_method        | Credit Card                               
 shipping_address      | 1983 Reidsville Street/Annapolis/MD/21401 
 sales_agent_name      | null                   

In [232]:
split_dfs['online'].show(2, False, True)

-RECORD 0----------------------------------------------------------
 transaction_date      | 2023-1-1                                  
 transaction_id        | trx-332766751554                          
 customer_id           | 85469                                     
 customer_email        | john.taylor@yahoo..com                    
 sales_agent_id        | null                                      
 branch_id             | null                                      
 product_id            | 17                                        
 product_name          | Blouse                                    
 product_category      | Clothing                                  
 units                 | 8                                         
 unit_price            | 29.99                                     
 payment_method        | Credit Card                               
 shipping_address      | 1983 Reidsville Street/Annapolis/MD/21401 
 sales_agent_name      | null                   

In [234]:
for df_type, df in split_dfs.items():
        if df_type == "offline":
            df = dq_layer.apply_offline_transformations(df)
            df = df.select(Schemas.offline_transactions.fieldNames())
        else:  # online
            df = dq_layer.apply_online_transformations(df)
            df = df.select(Schemas.online_transactions.fieldNames())
        
        df_with_index = dq_layer.add_row_index(df)
        DataWriter.write_parquet(spark, df_with_index, Config.STANDARDIZED_BASE_PATH, df_type, ["transaction_date"])
        

Created directory: /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10
Written online transactions for group [Row(group='group4')] to /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10/online_transactions_[Row(group='group4')]_20240710135203
Directory already exists: /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10
Written offline transactions for group [Row(group='group4')] to /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-10/offline_transactions_[Row(group='group4')]_20240710135301


In [152]:
split_dfs['offline'].show(2, False, True)

Py4JJavaError: An error occurred while calling o3267.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 90.0 failed 4 times, most recent failure: Lost task 0.3 in stage 90.0 (TID 339, itvdelab, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark2/python/pyspark/worker.py", line 377, in main
    process()
  File "/opt/spark2/python/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark2/python/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/opt/spark2/python/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/opt/spark2/python/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/opt/spark2/python/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/opt/spark2/python/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/home/itversity/itversity-material/Q-company_project/Scripts/transformation.py", line 68, in validate_unit_price
    return abs(price)
  File "/opt/spark2/python/pyspark/sql/functions.py", line 44, in _
    jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col)
AttributeError: 'NoneType' object has no attribute '_jvm'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:260)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2088)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2107)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:370)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3388)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3369)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3368)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor82.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark2/python/pyspark/worker.py", line 377, in main
    process()
  File "/opt/spark2/python/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark2/python/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/opt/spark2/python/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/opt/spark2/python/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/opt/spark2/python/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/opt/spark2/python/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/home/itversity/itversity-material/Q-company_project/Scripts/transformation.py", line 68, in validate_unit_price
    return abs(price)
  File "/opt/spark2/python/pyspark/sql/functions.py", line 44, in _
    jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col)
AttributeError: 'NoneType' object has no attribute '_jvm'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:260)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:252)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:411)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:417)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
split_dfs['online'].show(2, False, True)

In [21]:
spark.stop()