In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum



In [2]:
spark = SparkSession\
    .builder\
    .appName("B2B_daily_sales_dump")\
    .getOrCreate()


In [4]:
#Offline Transactions Schema
offline_transactions = StructType([
    StructField("transaction_date", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("sales_agent_id", StringType(), nullable=False),
    StructField("branch_id", StringType(), nullable=False),
    StructField("product_id", IntegerType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("is_online", StringType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("sales_agent_name", StringType(), nullable=False),
    StructField("sales_agent_hire_date", StringType(), nullable=False),
    StructField("branch_location", StringType(), nullable=False),
    StructField("branch_establish_date", StringType(), nullable=False),
    StructField("branch_class", StringType(), nullable=False),
    StructField("group", StringType(), nullable=False),
    StructField("row_index", StringType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("cleaned_email", StringType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False)
])

# Path to the directory containing the CSV files
offline_merged = "/Graduation_Project/cleaned_data/offline/*/*.csv"

# Read all CSV files in the directory into a single DataFrame without schema for verification
offline_merged_df = spark.read.option("header", "true").schema(offline_transactions).csv(offline_merged)
offline_merged_df.coalesce(1)

transaction_date,customer_id,sales_agent_id,branch_id,product_id,product_name,product_category,units,is_online,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,row_index,discount,customer_name,cleaned_email,transaction_id,unit_price
2022-07-19,85513,10.0,6.0,27,Iron,Appliances,7,no,Cash,Sophia Moore,2019-11-10,Oklahoma,2016-09-20,A,6,0,0.0,Alexander Johnson,alexander.johnson...,trx-878108770002,29.99
2023-08-06,85510,2.0,6.0,28,Hair Dryer,Appliances,10,no,Cash,Jane Smith,2021-06-08,Oklahoma,2016-09-20,A,6,1,0.1,Ava Smith,ava.smith@gmail.com,trx-349443438637,19.99
2022-12-28,85553,6.0,4.0,13,Printer,Electronics,2,no,Credit Card,Emma Taylor,2018-08-09,Houston,2016-11-05,D,6,2,0.15,Mia Brown,mia.brown@hotmail...,trx-045891300294,149.99
2023-06-28,85520,3.0,2.0,12,Monitor,Electronics,6,no,Credit Card,Michael Johnson,2019-07-22,Los Angeles,2016-07-28,B,6,3,0.0,Olivia Taylor,olivia.taylor@out...,trx-756996252944,299.99
2023-09-05,85539,8.0,3.0,5,T-Shirt,Clothing,1,no,Credit Card,Olivia Davis,2019-12-08,Chicago,2015-03-10,A,6,4,0.05,John Moore,john.moore@gmail.com,trx-491216466700,19.99
2023-03-16,85517,7.0,4.0,15,Hoodie,Clothing,3,no,Credit Card,Christopher Miller,2018-07-05,Houston,2016-11-05,D,6,5,0.2,John Brown,john.brown@hotmai...,trx-274239612034,29.99
2022-09-15,85496,11.0,1.0,19,Sandals,Footwear,7,no,Credit Card,john wick,2018-07-10,New York,2017-01-15,A,6,6,0.0,Emma Smith,emma.smith@gmail.com,trx-054194579945,29.99
2022-09-24,85469,6.0,3.0,13,Printer,Electronics,3,no,Cash,Emma Taylor,2018-08-09,Chicago,2015-03-10,A,6,7,0.0,Emma Miller,emma.miller@outlo...,trx-217671445546,149.99
2022-09-08,85529,10.0,3.0,23,Toaster,Appliances,9,no,Credit Card,Sophia Moore,2019-11-10,Chicago,2015-03-10,A,6,8,0.0,Mia Williams,mia.williams@yaho...,trx-765298457963,39.99
2022-12-09,85523,7.0,1.0,25,Washing Machine,Appliances,1,no,Cash,Christopher Miller,2018-07-05,New York,2017-01-15,A,6,9,0.0,William Brown,william.brown@hot...,trx-469050698996,499.99


In [5]:
offline_merged_df.count()

505000

In [18]:
daily_sales_dump_df=offline_merged_df.select(
    col("sales_agent_name"),
    col("product_name"),
    col("units"))



In [19]:
daily_sales_dump_df = daily_sales_dump_df.groupBy("sales_agent_name", "product_name") \
              .agg(sum(col("units")).alias("total_sold_units")).orderBy("sales_agent_name")


daily_sales_dump_df.coalesce(1)

sales_agent_name,product_name,total_sold_units
Christopher Miller,Heels,8567
Christopher Miller,Hair Dryer,8501
Christopher Miller,Smartphone,9052
Christopher Miller,Headphones,8500
Christopher Miller,Microwave,8649
Christopher Miller,Coffee Maker,7978
Christopher Miller,T-Shirt,8343
Christopher Miller,Camera,8544
Christopher Miller,Electric Kettle,8562
Christopher Miller,Washing Machine,8716


In [21]:
output_path = "file:///data/daily_sales_dump"
daily_sales_dump_df.coalesce(1).write.option("header", "true").mode("overwrite").csv(output_path)



In [None]:
spark.stop()
