In [4]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import col, lit, when, coalesce, split, concat, udf, \
                                    regexp_replace, lower, monotonically_increasing_id, regexp_extract, create_map
from itertools import chain
from typing import Dict
import re 
from datetime import datetime

In [2]:
spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("DataQualityLayer")\
    .config("spark.eventLog.logBlockUpdates.enabled", True)\
    .getOrCreate()

In [5]:
def get_latest_file(spark, hdfs_path):
    # List all files in the directory
    files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
        .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
    
    # Sort files by modification time (most recent first)
    sorted_files = sorted(files, key=lambda f: f.getModificationTime(), reverse=True)
    
    if not sorted_files:
        return None
    
    # Get the path of the most recent file
    return sorted_files[0].getPath().toString()


'/user/itversity/q-company_raw_layer/sales_transaction_2024-07-09.parquet'

In [None]:
current_date = datetime.now().strftime("%Y-%m-%d")
    
# Construct HDFS path
hdfs_base_path = "/user/itversity/q-company_raw_layer"
hdfs_path = f"{hdfs_base_path}/sales_transactions_{current_date}"

# Get the latest file
latest_file = get_latest_file(spark, hdfs_path)

if latest_file:
    print(f"Processing file: {latest_file}")

    # Read the Parquet file
    df = spark.read.parquet(latest_file)

       
    # Show some results
    df_processed.show()

        
else:
    print(f"No files found in {hdfs_path}")


In [106]:
#ٍ1. Tables Schemas
#Raw Data Schema
raw_dataSchema = StructType([
    StructField("transaction_date", DateType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("customer_fname", StringType(), nullable=False),
    StructField("customer_lname", StringType(), nullable=False),
    StructField("customer_email", StringType(), nullable=False),
    StructField("sales_agent_id", StringType(), nullable=True),
    StructField("branch_id", StringType(), nullable=True),
    StructField("product_id", IntegerType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("offer_1", StringType(), nullable=True),
    StructField("offer_2", StringType(), nullable=True),
    StructField("offer_3", StringType(), nullable=True),
    StructField("offer_4", StringType(), nullable=True),
    StructField("offer_5", StringType(), nullable=True),
    StructField("units", IntegerType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False),
    StructField("is_online", StringType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("shipping_address", StringType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("hire_date", DateType(), nullable=True),
    StructField("location", StringType(), nullable=True),
    StructField("establish_date", DateType(), nullable=True),
    StructField("class", StringType(), nullable=True),
    StructField("group", StringType(), nullable=False),
    StructField("logs", StringType(), nullable=True),
    StructField("source", StringType(), nullable=True)
])

#Offline Transactions Schema
offline_transactions = StructType([
    StructField("transaction_date", DateType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("customer_email", StringType(), nullable=False),
    StructField("sales_agent_id", StringType(), nullable=False),
    StructField("branch_id", StringType(), nullable=False),
    StructField("product_id", IntegerType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("sales_agent_name", StringType(), nullable=False),
    StructField("sales_agent_hire_date", DateType(), nullable=False),
    StructField("branch_location", StringType(), nullable=False),
    StructField("branch_establish_date", DateType(), nullable=False),
    StructField("branch_class", StringType(), nullable=False),
    StructField("group", StringType(), nullable=False)
])

#Online Transactions Schema
online_transactions = StructType([
    StructField("transaction_date", DateType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("cusomter_email", StringType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("shipping_street_name",  StringType(), nullable=False), 
    StructField("shipping_city",  StringType(), nullable=False),
    StructField("shipping_state",  StringType(), nullable=False),
    StructField("shipping_zip_code",  StringType(), nullable=False),
    StructField("group", StringType(), nullable=False)
])

#Products Schema
products_schema = StructType([
    StructField("product_id", StringType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False)   
])

#Customers Schema
customers_schema = StructType([
    StructField("customer_id", LongType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("cusomter_email", StringType(), nullable=False)
])

#Sales Offline Transactions Schema
offline_transactions = StructType([
    StructField("transaction_id", StringType(), nullable=False),
    StructField("transaction_date", DateType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("sales_agent_id", StringType(), nullable=False),
    StructField("branch_id", StringType(), nullable=False),
    StructField("product_id", IntegerType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("group", StringType(), nullable=False)
])

#Sales Online Transactions Schema
online_transactions = StructType([
    StructField("transaction_date", DateType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("product_id", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("location_id",  StringType(), nullable=False), 
    StructField("group", StringType(), nullable=False)
])

#Branches Schema
branches_schema = StructType([
    StructField("branch_id", StringType(), nullable=False),
    StructField("branch_location", StringType(), nullable=False),
    StructField("branch_establish_date", DateType(), nullable=False),
    StructField("branch_class", StringType(), nullable=False)
])

#Sales Agents Schema
sales_agents_schema = StructType([
    StructField("sales_agent_id", StringType(), nullable=False),
    StructField("sales_agent_name", StringType(), nullable=False),
    StructField("sales_agent_hire_date", DateType(), nullable=False)
])

#Locations Schema
locations_schema = StructType([
    StructField("location_id", StringType(), nullable=False),
    StructField("location_street_name", StringType(), nullable=False),
    StructField("location_city", StringType(), nullable=False),
    StructField("location_state", StringType(), nullable=False),
    StructField("location_zip_code", StringType(), nullable=False)
])


In [107]:
#2. Reading the group ingested data file from the raw layer with inforced schema defintion
raw_df = spark.read.csv("/user/q-company/raw_layer_test/all_groups_merged.csv", schema=raw_dataSchema, header=True)

#3. Generate the surrogate key row_index
raw_df = raw_df.withColumn("row_index", monotonically_increasing_id())

raw_df.show(n=2, truncate=False, vertical=True)

-RECORD 0---------------------------------------
 transaction_date | 2023-05-20                  
 transaction_id   | trx-152546429674            
 customer_id      | 85469                       
 customer_fname   | Alexander                   
 customer_lname   | Brown                       
 customer_email   | alexander.brown@gmail.com"" 
 sales_agent_id   | 1.0                         
 branch_id        | 2.0                         
 product_id       | 22                          
 product_name     | Coffee Maker                
 product_category | Appliances                  
 offer_1          | null                        
 offer_2          | null                        
 offer_3          | null                        
 offer_4          | null                        
 offer_5          | null                        
 units            | 10                          
 unit_price       | 79.99                       
 is_online        | no                          
 payment_method   | 

In [109]:
#4. Rename some columns names of the sales agents and branches columns
def rename_columns(df: DataFrame) -> DataFrame:
    return df.withColumnRenamed("name", "sales_agent_name") \
                .withColumnRenamed("hire_date", "sales_agent_hire_date") \
                .withColumnRenamed("location", "branch_location") \
                .withColumnRenamed("establish_date", "branch_establish_date") \
                .withColumnRenamed("class", "branch_class")

renamed_df = rename_columns(raw_df)

renamed_df.columns

['transaction_date',
 'transaction_id',
 'customer_id',
 'customer_fname',
 'customer_lname',
 'customer_email',
 'sales_agent_id',
 'branch_id',
 'product_id',
 'product_name',
 'product_category',
 'offer_1',
 'offer_2',
 'offer_3',
 'offer_4',
 'offer_5',
 'units',
 'unit_price',
 'is_online',
 'payment_method',
 'shipping_address',
 'sales_agent_name',
 'sales_agent_hire_date',
 'branch_location',
 'branch_establish_date',
 'branch_class',
 'group',
 'logs',
 'source',
 'row_index']

In [110]:
#5. Remove Blank Columns
def remove_blank_columns(df: DataFrame) -> DataFrame:
    return df.select([c for c in df.columns if df.filter(col(c).isNotNull()).count() > 0])

raw_without_blanks_cols = remove_blank_columns(renamed_df)

raw_without_blanks_cols.show(2, False, True)

-RECORD 0--------------------------------------------
 transaction_date      | 2023-05-20                  
 transaction_id        | trx-152546429674            
 customer_id           | 85469                       
 customer_fname        | Alexander                   
 customer_lname        | Brown                       
 customer_email        | alexander.brown@gmail.com"" 
 sales_agent_id        | 1.0                         
 branch_id             | 2.0                         
 product_id            | 22                          
 product_name          | Coffee Maker                
 product_category      | Appliances                  
 offer_1               | null                        
 offer_2               | null                        
 offer_3               | null                        
 offer_4               | null                        
 offer_5               | null                        
 units                 | 10                          
 unit_price            | 79.

In [46]:
#6. Splitting the shipping address into (shipping_street_name, shipping_city, shipping_state, shipping_zip_code)
def split_shipping_address(df: DataFrame) -> DataFrame:
    return df.withColumn("shipping_address_split", split(col("shipping_address"), "/")) \
                .withColumn("shipping_street_name", col("shipping_address_split")[0]) \
                .withColumn("shipping_city", col("shipping_address_split")[1]) \
                .withColumn("shipping_state", col("shipping_address_split")[2]) \
                .withColumn("shipping_zip_code", col("shipping_address_split")[3]) \
                .drop("shipping_address", "shipping_address_split")

raw_splited_address = split_shipping_address(raw_without_blanks_cols)

raw_splited_address.filter(raw_splited_address.is_online.isin('yes')).show(2, False, True)

-RECORD 0---------------------------------------
 transaction_date     | 2022-11-24              
 transaction_id       | trx-630807021567        
 customer_id          | 85541                   
 customer_fname       | Ava                     
 customer_lname       | Jones                   
 customer_email       | ava.jones@hotmail.com:  
 sales_agent_id       | null                    
 branch_id            | null                    
 product_id           | 27                      
 product_name         | Iron                    
 product_category     | Appliances              
 offer_1              | null                    
 offer_2              | null                    
 offer_3              | null                    
 offer_4              | null                    
 offer_5              | null                    
 units                | 6                       
 unit_price           | 29.99                   
 is_online            | yes                     
 payment_method     

In [59]:
#7. Mapping the values of the shipping_state columns from abbrevations to a full name 
state_dict = {
    'AZ': 'Arizona',
    'DC': 'District of Columbia',
    'KY': 'Kentucky',
    'CA': 'California',
    'CT': 'Connecticut',
    'VT': 'Vermont',
    'MD': 'Maryland',
    'AL': 'Alabama',
    'TN': 'Tennessee',
    'GA': 'Georgia',
    'MA': 'Massachusetts',
    'FL': 'Florida',
    'CO': 'Colorado',
    'AK': 'Alaska',
    'AR': 'Arkansas',
    'OK': 'Oklahoma',
    'Washington': 'Washington'  
}

def map_shipping_state(spark: SparkSession, df: DataFrame, state_dict: Dict[str, str]) -> DataFrame:
    broadcast_dict = spark.sparkContext.broadcast(state_dict)

    conditions = coalesce(*[when(col("shipping_state") == key, lit(value)) for key, value in broadcast_dict.value.items()])

    df = df.withColumn("shipping_state_mapped", when(conditions.isNotNull(), conditions).otherwise(col("shipping_state")))

    return df


mapped_shipped_state_df = map_shipping_state(spark, raw_splited_address, state_dict)

mapped_shipped_state_df = mapped_shipped_state_df.drop("shipping_state")

mapped_shipped_state_df = mapped_shipped_state_df.withColumnRenamed("shipping_state_mapped", "shipping_state")

mapped_shipped_state_df.filter(mapped_shipped_state_df.is_online.isin('yes')).show(2, False, True)


-RECORD 0---------------------------------------
 transaction_date     | 2022-11-24              
 transaction_id       | trx-630807021567        
 customer_id          | 85541                   
 customer_fname       | Ava                     
 customer_lname       | Jones                   
 customer_email       | ava.jones@hotmail.com:  
 sales_agent_id       | null                    
 branch_id            | null                    
 product_id           | 27                      
 product_name         | Iron                    
 product_category     | Appliances              
 offer_1              | null                    
 offer_2              | null                    
 offer_3              | null                    
 offer_4              | null                    
 offer_5              | null                    
 units                | 6                       
 unit_price           | 29.99                   
 is_online            | yes                     
 payment_method     

In [64]:
#8. Mapping offers columns into a one discount column 
offers_dict = {
    "null": 0.0,
    "offer_1": 0.05,
    "offer_2": 0.1,
    "offer_3": 0.15,
    "offer_4": 0.20,
    "offer_5": 0.20
}

def map_offers_to_discount(spark: SparkSession, df: DataFrame, offers_dict: Dict[str, float]) -> DataFrame:
    broadcast_offers = spark.sparkContext.broadcast(offers_dict)
    
    offer_columns = ["offer_1", "offer_2", "offer_3", "offer_4", "offer_5"]
    
    def create_offer_column(offer_col: str):
        return when(col(offer_col) == lit(True), lit(broadcast_offers.value[offer_col]))
    
    offer_discount_columns = [create_offer_column(offer_col).alias(f"{offer_col}_discount") for offer_col in offer_columns]
    
    df_with_offer_discounts = df.select("*", *offer_discount_columns)
    
    discount_column = coalesce(*[col(f"{offer_col}_discount") for offer_col in offer_columns], lit(broadcast_offers.value["null"]))
    
    return df_with_offer_discounts.withColumn("discount", discount_column.cast(FloatType())) \
                                  .drop(*[f"{offer_col}_discount" for offer_col in offer_columns]) \
                                    .drop("offer_1", "offer_2", "offer_3", "offer_4", "offer_5")

offers_mapped_df = map_offers_to_discount(spark, mapped_shipped_state_df, offers_dict)

offers_mapped_df

In [66]:
#9. Merge customer first name and second name into customer name column 
def merge_customer_name(df: DataFrame) -> DataFrame:
        return df.withColumn("customer_name", concat(col("customer_fname"), lit(" "),col("customer_lname"))) \
                 .drop("customer_fname", "customer_lname")

merged_custname_df = merge_customer_name(offers_mapped_df)

merged_custname_df.filter(merged_custname_df.is_online.isin('yes')).show(2, False, True)

-RECORD 0---------------------------------------
 transaction_date     | 2022-11-24              
 transaction_id       | trx-630807021567        
 customer_id          | 85541                   
 customer_email       | ava.jones@hotmail.com:  
 sales_agent_id       | null                    
 branch_id            | null                    
 product_id           | 27                      
 product_name         | Iron                    
 product_category     | Appliances              
 units                | 6                       
 unit_price           | 29.99                   
 is_online            | yes                     
 payment_method       | Credit Card             
 name                 | null                    
 hire_date            | null                    
 location             | null                    
 establish_date       | null                    
 class                | null                    
 group                | 1                       
 row_index          

In [89]:
#10. clean customer email into valid email formt
def clean_email(email: str) -> str:
    if email is None:
        return None
    
    email = email.strip()
    
    email = re.sub(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}).*', r'\1', email)
    
    if re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
        return email
    else:
        return None  

clean_email_udf = udf(clean_email, StringType())

df_cleaned = df.withColumn("cleaned_email", clean_email_udf(merged_custname_df["customer_email"]))

df_cleaned = df_cleaned.drop("customer_email")

df_cleaned = df_cleaned.withColumnRenamed("customer_email", "cleaned_email")

df_cleaned.show(3, False, True)

-RECORD 0------------------------------------------
 transaction_date      | 2023-05-20                
 transaction_id        | trx-152546429674          
 customer_id           | 85469                     
 customer_fname        | Alexander                 
 customer_lname        | Brown                     
 sales_agent_id        | 1.0                       
 branch_id             | 2.0                       
 product_id            | 22                        
 product_name          | Coffee Maker              
 product_category      | Appliances                
 offer_1               | null                      
 offer_2               | null                      
 offer_3               | null                      
 offer_4               | null                      
 offer_5               | null                      
 units                 | 10                        
 unit_price            | 79.99                     
 is_online             | no                        
 payment_met

In [90]:
#11. clean the transaction_id and validate it's format
def validate_transaction_id(trx_id: str) -> str:
    if trx_id is None:
        return None
    
    trx_id = trx_id.strip()
    
    numeric_part = re.sub(r'\D', '', trx_id)
    
    if not numeric_part:
        return None
    
    valid_trx_id = f"trx-{numeric_part}"
    
    return valid_trx_id

validate_transaction_id = udf(validate_transaction_id, StringType())

df_cleaned = df_cleaned.withColumn("cleaned_transaction_id", validate_transaction_id(df["transaction_id"]))

df_cleaned = df_cleaned.drop("transaction_id")

df_cleaned = df_cleaned.withColumnRenamed("cleaned_transaction_id", "transaction_id")

df_cleaned.show(2, False, True)

-RECORD 0------------------------------------------
 transaction_date      | 2023-05-20                
 customer_id           | 85469                     
 customer_fname        | Alexander                 
 customer_lname        | Brown                     
 sales_agent_id        | 1.0                       
 branch_id             | 2.0                       
 product_id            | 22                        
 product_name          | Coffee Maker              
 product_category      | Appliances                
 offer_1               | null                      
 offer_2               | null                      
 offer_3               | null                      
 offer_4               | null                      
 offer_5               | null                      
 units                 | 10                        
 unit_price            | 79.99                     
 is_online             | no                        
 payment_method        | Cash                      
 name       

In [94]:
#12. validate the unit price column 
def validate_unit_price(price: float) -> float:
    if price is None:
        return None
    
    return price if price >= 0 else -1 * price
    
validate_unit_price_udf = udf(validate_unit_price, DoubleType())

df_cleaned = df_cleaned.withColumn("validated_unit_price", validate_unit_price_udf(df["unit_price"]))

df_cleaned = df_cleaned.drop("unit_price")

df_cleaned = df_cleaned.withColumnRenamed("validated_unit_price", "unit_price")

df_cleaned.show(2, False, True)

In [101]:
df_cleaned.columns

['transaction_date',
 'customer_id',
 'customer_fname',
 'customer_lname',
 'sales_agent_id',
 'branch_id',
 'product_id',
 'product_name',
 'product_category',
 'offer_1',
 'offer_2',
 'offer_3',
 'offer_4',
 'offer_5',
 'units',
 'is_online',
 'payment_method',
 'name',
 'hire_date',
 'location',
 'establish_date',
 'class',
 'group',
 'row_index',
 'shipping_street_name',
 'shipping_city',
 'shipping_state',
 'shipping_zip_code',
 'shipping_state_mapped',
 'cleaned_email',
 'transaction_id',
 'unit_price']

In [99]:
#1. Build Reference Data

#get unique values of product category and product name
products = df_cleaned.select("product_name").distinct()
product_category = df_cleaned.select("product_category").distinct()
product_category_distinct = df_cleaned.select("product_category", "product_name").distinct()

#get unique values of shipping locations
shipping_locations = df_cleaned.select("shipping_street_name", "shipping_state", 
                                       "shipping_city", "shipping_zip_code").distinct()
shipping_street_names = df_cleaned.select("shipping_street_name").distinct()
shipping_states = df_cleaned.select("shipping_state").distinct()
shipping_cities = df_cleaned.select("shipping_city").distinct()
shipping_zip_code = df_cleaned.select("shipping_zip_code").distinct()

#get unique branch location, class
branch_locations = df_cleaned.select("branch_location").distinct()
branch_classes = df_cleaned.select("branch_class").distinct()

#get unique groups
groups = df_cleaned.select("group").distinct()

In [157]:
online_df = result_df.filter("is_online == 'yes'").drop("sales_agent_id", "branch_id", "sales_agent_name", 
                                                         "sales_agent_hire_date", "branch_location",
                                                         "branch_establish_date", "branch_class")
offline_df = result_df.filter("is_online == 'no'").drop("shipping_street_name", "shipping_city", 
                                                        "shipping_state", "shipping_zip_code")

In [164]:
offline_df.coalesce(1)
online_df.coalesce(1)

transaction_date,transaction_id,customer_id,customer_email,product_id,product_name,product_category,units,unit_price,is_online,payment_method,group,row_index,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,discount,customer_name
2022-11-24,trx-630807021567,85541,ava.jones@hotmail...,27,Iron,Appliances,6,29.99,yes,Credit Card,1,0,43 Henderson Avenue,Savannah,GA,31406,0.0,Ava Jones
2023-01-20,trx-231766088020,85550,emma.taylor@gmail...,8,Sneakers,Footwear,6,79.99,yes,PayPal,1,1,8 Linden Circle,Somerville,MA,2143,0.0,Emma Taylor
2022-10-18,trx-334836612567,85479,james.jones@outlo...,6,Jeans,Clothing,2,49.99,yes,Stripe,1,2,1139 Addison Street,Berkeley,CA,94702,0.15,James Jones
2022-04-06,trx-678424403739,85551,michael.johnson@y...,10,Sandals,Footwear,3,39.99,yes,Credit Card,1,3,521 McGregor Court,Montgomery,AL,36117,0.0,Michael Johnson
2022-08-09,trx-034026221655,85470,ava.wilson@hotmai...,3,Tablet,Electronics,6,299.99,yes,Stripe,1,4,10802 Brickside C...,Riverview,FL,33579,0.0,Ava Wilson
2023-10-27,trx-334049470367,85507,ava.miller@outloo...,25,Washing Machine,Appliances,5,499.99,yes,Credit Card,1,5,131 Westerly Street,Manchester,CT,6042,0.2,Ava Miller
2023-03-14,trx-354281638130,85466,michael.brown@yah...,16,Skirt,Clothing,9,39.99,yes,PayPal,1,6,5403 Illinois Avenue,Nashville,TN,37209,0.0,Michael Brown
2023-11-06,trx-919712049046,85493,james.johnson@gma...,27,Iron,Appliances,6,29.99,yes,Credit Card,1,7,10304 North 179th...,Waddell,AZ,85355,0.2,James Johnson
2023-02-17,trx-947364728949,85550,emma.taylor@gmail...,20,Heels,Footwear,9,59.99,yes,Stripe,1,8,10340 West 62nd P...,Arvada,CO,80004,0.0,Emma Taylor
2023-01-12,trx-624869757221,85499,sophia.miller@hot...,3,Tablet,Electronics,1,299.99,yes,PayPal,1,9,629 Cutter Court,Annapolis,MD,21401,0.0,Sophia Miller


In [166]:
offline_df.write.csv("file:///data/dd/offline.csv")

In [167]:
online_df.write.csv("file:///data/dd/online.csv")

In [6]:
spark.stop()