In [59]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.window import Window
from itertools import chain 
from typing import Dict
from pyspark.sql.functions import col, lit, when, coalesce, split, concat, udf, concat_ws, row_number, \
                                    regexp_replace, lower, monotonically_increasing_id, regexp_extract, create_map
import re 

In [60]:
spark = SparkSession\
    .builder\
    .appName("DataTransformedLayer")\
    .getOrCreate()


In [61]:
# Online Transactions Schema
online_transactions = StructType([
    StructField("transaction_date", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("product_id", LongType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("is_online", StringType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("group", StringType(), nullable=False),
    StructField("row_index", LongType(), nullable=False),
    StructField("shipping_street_name", StringType(), nullable=False),
    StructField("shipping_city", StringType(), nullable=False),
    StructField("shipping_zip_code", StringType(), nullable=False),
    StructField("shipping_state", StringType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("cleaned_email", StringType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False)
])

# Path to the directory containing the CSV files
online_merged = "/Graduation_Project/cleaned_data/online/*/*.csv"
online_merged_df = spark.read.option("header", "true").schema(online_transactions).csv(online_merged)
online_merged_df.coalesce(1)

transaction_date,customer_id,product_id,product_name,product_category,units,is_online,payment_method,group,row_index,shipping_street_name,shipping_city,shipping_zip_code,shipping_state,discount,customer_name,cleaned_email,transaction_id,unit_price
2023-06-02,85513,19,Sandals,Footwear,7,yes,Stripe,6,25769875038,700 Towering Oak ...,Glen Burnie,21061,Maryland,0.0,Alexander Johnson,alexander.johnson...,trx-035992640061,29.99
2022-12-13,85492,28,Hair Dryer,Appliances,8,yes,Stripe,6,25769875039,4728 Rockland Trail,Nashville,37013,Tennessee,0.0,Emma Taylor,emma.taylor@outlo...,trx-450105704924,19.99
2023-05-27,85556,16,Skirt,Clothing,7,yes,Credit Card,6,25769875040,44 Downey Drive,Manchester,6040,Connecticut,0.0,Ava Brown,ava.brown@yahoo.com,trx-184986835512,39.99
2022-03-23,85550,28,Hair Dryer,Appliances,9,yes,Credit Card,6,25769875041,3612 Jim Robison ...,Edmond,73013,Oklahoma,0.2,Sophia Davis,sophia.davis@yaho...,trx-934277502808,19.99
2022-03-06,85518,28,Hair Dryer,Appliances,10,yes,PayPal,6,25769875042,622 Elaine Drive,Nashville,37211,Tennessee,0.05,Alexander Davis,alexander.davis@o...,trx-420381525195,19.99
2022-08-21,85529,1,Laptop,Electronics,2,yes,PayPal,6,25769875043,129 Glen Street,West Brattleboro,5301,Vermont,0.2,Mia Williams,mia.williams@yaho...,trx-633490862447,999.99
2022-01-02,85538,22,Coffee Maker,Appliances,9,yes,PayPal,6,25769875044,88 Florence Street,New Bedford,2740,Massachusetts,0.0,Mia Johnson,mia.johnson@outlo...,trx-349483175929,79.99
2023-04-26,85557,22,Coffee Maker,Appliances,10,yes,PayPal,6,25769875045,8101 Vaughn Road,Montgomery,36117,Alabama,0.1,Olivia Davis,olivia.davis@gmai...,trx-157422050098,79.99
2022-09-22,85560,9,Boots,Footwear,10,yes,PayPal,6,25769875046,115 Maple Street,Manchester,6040,Connecticut,0.05,Emma Wilson,emma.wilson@gmail...,trx-732921927457,129.99
2022-09-22,85484,18,Boots,Footwear,7,yes,Credit Card,6,25769875047,169 Avenida Drive,Berkeley,94708,California,0.0,Olivia Wilson,olivia.wilson@yah...,trx-777834732693,149.99


In [62]:
# Calculate total price
online_merged_df = online_merged_df.withColumn("total_price", col("units") * col("unit_price") * (1 - col("discount")))

# Show updated DataFrame
online_merged_df.show(5, False, True)

-RECORD 0-------------------------------------------
 transaction_date     | 2023-06-02                  
 customer_id          | 85513                       
 product_id           | 19                          
 product_name         | Sandals                     
 product_category     | Footwear                    
 units                | 7                           
 is_online            | yes                         
 payment_method       | Stripe                      
 group                | 6                           
 row_index            | 25769875038                 
 shipping_street_name | 700 Towering Oak Court      
 shipping_city        | Glen Burnie                 
 shipping_zip_code    | 21061                       
 shipping_state       | Maryland                    
 discount             | 0.0                         
 customer_name        | Alexander Johnson           
 cleaned_email        | alexander.johnson@yahoo.com 
 transaction_id       | trx-035992640061      

In [63]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/online/merged_data.csv"
online_merged_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [64]:
#Offline Transactions Schema
offline_transactions = StructType([
    StructField("transaction_date", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("sales_agent_id", StringType(), nullable=False),
    StructField("branch_id", StringType(), nullable=False),
    StructField("product_id", IntegerType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("is_online", StringType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("sales_agent_name", StringType(), nullable=False),
    StructField("sales_agent_hire_date", StringType(), nullable=False),
    StructField("branch_location", StringType(), nullable=False),
    StructField("branch_establish_date", StringType(), nullable=False),
    StructField("branch_class", StringType(), nullable=False),
    StructField("group", StringType(), nullable=False),
    StructField("row_index", StringType(), nullable=False),
    StructField("discount", FloatType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("cleaned_email", StringType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False)
])

# Path to the directory containing the CSV files
offline_merged = "/Graduation_Project/cleaned_data/offline/*/*.csv"

# Read all CSV files in the directory into a single DataFrame without schema for verification
test2_df = spark.read.option("header", "true").csv(offline_merged)
offline_merged_df = spark.read.option("header", "true").schema(offline_transactions).csv(offline_merged)
offline_merged_df.coalesce(1)

transaction_date,customer_id,sales_agent_id,branch_id,product_id,product_name,product_category,units,is_online,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,row_index,discount,customer_name,cleaned_email,transaction_id,unit_price
2022-07-19,85513,10.0,6.0,27,Iron,Appliances,7,no,Cash,Sophia Moore,2019-11-10,Oklahoma,2016-09-20,A,6,0,0.0,Alexander Johnson,alexander.johnson...,trx-878108770002,29.99
2023-08-06,85510,2.0,6.0,28,Hair Dryer,Appliances,10,no,Cash,Jane Smith,2021-06-08,Oklahoma,2016-09-20,A,6,1,0.1,Ava Smith,ava.smith@gmail.com,trx-349443438637,19.99
2022-12-28,85553,6.0,4.0,13,Printer,Electronics,2,no,Credit Card,Emma Taylor,2018-08-09,Houston,2016-11-05,D,6,2,0.15,Mia Brown,mia.brown@hotmail...,trx-045891300294,149.99
2023-06-28,85520,3.0,2.0,12,Monitor,Electronics,6,no,Credit Card,Michael Johnson,2019-07-22,Los Angeles,2016-07-28,B,6,3,0.0,Olivia Taylor,olivia.taylor@out...,trx-756996252944,299.99
2023-09-05,85539,8.0,3.0,5,T-Shirt,Clothing,1,no,Credit Card,Olivia Davis,2019-12-08,Chicago,2015-03-10,A,6,4,0.05,John Moore,john.moore@gmail.com,trx-491216466700,19.99
2023-03-16,85517,7.0,4.0,15,Hoodie,Clothing,3,no,Credit Card,Christopher Miller,2018-07-05,Houston,2016-11-05,D,6,5,0.2,John Brown,john.brown@hotmai...,trx-274239612034,29.99
2022-09-15,85496,11.0,1.0,19,Sandals,Footwear,7,no,Credit Card,john wick,2018-07-10,New York,2017-01-15,A,6,6,0.0,Emma Smith,emma.smith@gmail.com,trx-054194579945,29.99
2022-09-24,85469,6.0,3.0,13,Printer,Electronics,3,no,Cash,Emma Taylor,2018-08-09,Chicago,2015-03-10,A,6,7,0.0,Emma Miller,emma.miller@outlo...,trx-217671445546,149.99
2022-09-08,85529,10.0,3.0,23,Toaster,Appliances,9,no,Credit Card,Sophia Moore,2019-11-10,Chicago,2015-03-10,A,6,8,0.0,Mia Williams,mia.williams@yaho...,trx-765298457963,39.99
2022-12-09,85523,7.0,1.0,25,Washing Machine,Appliances,1,no,Cash,Christopher Miller,2018-07-05,New York,2017-01-15,A,6,9,0.0,William Brown,william.brown@hot...,trx-469050698996,499.99


In [65]:
# Calculate total price
offline_merged_df = offline_merged_df.withColumn("total_price", col("units") * col("unit_price") * (1 - col("discount")))

offline_merged_df.show(5, False, True)

-RECORD 0--------------------------------------------
 transaction_date      | 2022-07-19                  
 customer_id           | 85513                       
 sales_agent_id        | 10.0                        
 branch_id             | 6.0                         
 product_id            | 27                          
 product_name          | Iron                        
 product_category      | Appliances                  
 units                 | 7                           
 is_online             | no                          
 payment_method        | Cash                        
 sales_agent_name      | Sophia Moore                
 sales_agent_hire_date | 2019-11-10                  
 branch_location       | Oklahoma                    
 branch_establish_date | 2016-09-20                  
 branch_class          | A                           
 group                 | 6                           
 row_index             | 0                           
 discount              | 0.0

In [66]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/offline/merged_data.csv"
offline_merged_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [67]:
# Define the schema
Alldata_transactions_schema = StructType([
    StructField("transaction_date", StringType(), nullable=False),
    StructField("customer_id", LongType(), nullable=False),
    StructField("sales_agent_id", DoubleType(), nullable=True),  
    StructField("branch_id", DoubleType(), nullable=True),       
    StructField("product_id", LongType(), nullable=False),
    StructField("product_name", StringType(), nullable=False),
    StructField("product_category", StringType(), nullable=False),
    StructField("units", IntegerType(), nullable=False),
    StructField("is_online", StringType(), nullable=False),
    StructField("payment_method", StringType(), nullable=False),
    StructField("sales_agent_name", StringType(), nullable=False),
    StructField("sales_agent_hire_date", StringType(), nullable=False), 
    StructField("branch_location", StringType(), nullable=False),
    StructField("branch_establish_date", StringType(), nullable=False),  
    StructField("branch_class", StringType(), nullable=False),
    StructField("group", StringType(), nullable=False),
    StructField("row_index", LongType(), nullable=False),
    StructField("shipping_street_name", StringType(), nullable=True),   
    StructField("shipping_city", StringType(), nullable=True),
    StructField("shipping_zip_code", StringType(), nullable=True),
    StructField("shipping_state", StringType(), nullable=True),
    StructField("discount", FloatType(), nullable=False),
    StructField("customer_name", StringType(), nullable=False),
    StructField("cleaned_email", StringType(), nullable=False),
    StructField("transaction_id", StringType(), nullable=False),
    StructField("unit_price", DoubleType(), nullable=False)
])

# Now you can read the CSV file with this schema in Spark
all_groups_merged_df = spark.read.csv("/Graduation_Project/cleaned_data/all_groups_merged.csv", schema=Alldata_transactions_schema ,header=True)

all_groups_merged_df.coalesce(1)

transaction_date,customer_id,sales_agent_id,branch_id,product_id,product_name,product_category,units,is_online,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,row_index,shipping_street_name,shipping_city,shipping_zip_code,shipping_state,discount,customer_name,cleaned_email,transaction_id,unit_price
2023-05-20,85469,1.0,2.0,22,Coffee Maker,Appliances,10,no,Cash,John Doe,2020-06-03,Los Angeles,2016-07-28,B,1,0,,,,,0.0,Alexander Brown,alexander.brown@g...,trx-152546429674,79.99
2022-10-25,85512,3.0,1.0,24,Blender,Appliances,5,no,Cash,Michael Johnson,2021-10-03,New York,2017-01-15,A,1,1,,,,,0.2,William Brown,william.brown@gma...,trx-291375327542,49.99
2022-02-05,85484,10.0,3.0,4,Headphones,Electronics,1,no,Credit Card,Sophia Moore,2019-05-25,Chicago,2015-03-10,A,1,2,,,,,0.0,John Williams,john.williams@gma...,trx-312507679871,99.99
2023-10-20,85528,7.0,2.0,25,Washing Machine,Appliances,8,no,Cash,Christopher Miller,2020-01-11,Los Angeles,2016-07-28,B,1,3,,,,,0.0,Alexander Miller,alexander.miller@...,trx-193384855491,499.99
2022-11-17,85500,5.0,1.0,14,Camera,Electronics,10,no,Cash,David Wilson,2021-04-08,New York,2017-01-15,A,1,4,,,,,0.15,John Brown,john.brown@hotmai...,trx-831626097654,399.99
2022-09-27,85545,4.0,5.0,14,Camera,Electronics,6,no,Credit Card,Emily Brown,2020-10-25,Phoenix,2017-09-20,C,1,5,,,,,0.2,Sophia Wilson,sophia.wilson@hot...,trx-158496122054,399.99
2022-04-21,85561,4.0,1.0,30,Electric Kettle,Appliances,6,no,Credit Card,Emily Brown,2020-10-25,New York,2017-01-15,A,1,6,,,,,0.2,Alexander Moore,alexander.moore@y...,trx-722817999024,24.99
2023-04-28,85520,1.0,1.0,26,Vacuum Cleaner,Appliances,4,no,Cash,John Doe,2020-06-03,New York,2017-01-15,A,1,7,,,,,0.0,Alexander Wilson,alexander.wilson@...,trx-813287633702,199.99
2023-03-08,85488,6.0,2.0,18,Boots,Footwear,10,no,Credit Card,Emma Taylor,2019-03-28,Los Angeles,2016-07-28,B,1,8,,,,,0.0,Michael Miller,michael.miller@ya...,trx-219568257432,149.99
2023-06-17,85466,5.0,2.0,16,Skirt,Clothing,8,no,Cash,David Wilson,2021-04-08,Los Angeles,2016-07-28,B,1,9,,,,,0.0,Michael Brown,michael.brown@yah...,trx-352160720823,39.99


In [68]:
# Calculate total price
all_groups_merged_df = all_groups_merged_df.withColumn("total_price", col("units") * col("unit_price") * (1 - col("discount")))

all_groups_merged_df.coalesce(1)

transaction_date,customer_id,sales_agent_id,branch_id,product_id,product_name,product_category,units,is_online,payment_method,sales_agent_name,sales_agent_hire_date,branch_location,branch_establish_date,branch_class,group,row_index,shipping_street_name,shipping_city,shipping_zip_code,shipping_state,discount,customer_name,cleaned_email,transaction_id,unit_price,total_price
2023-05-20,85469,1.0,2.0,22,Coffee Maker,Appliances,10,no,Cash,John Doe,2020-06-03,Los Angeles,2016-07-28,B,1,0,,,,,0.0,Alexander Brown,alexander.brown@g...,trx-152546429674,79.99,799.9
2022-10-25,85512,3.0,1.0,24,Blender,Appliances,5,no,Cash,Michael Johnson,2021-10-03,New York,2017-01-15,A,1,1,,,,,0.2,William Brown,william.brown@gma...,trx-291375327542,49.99,199.96000297963624
2022-02-05,85484,10.0,3.0,4,Headphones,Electronics,1,no,Credit Card,Sophia Moore,2019-05-25,Chicago,2015-03-10,A,1,2,,,,,0.0,John Williams,john.williams@gma...,trx-312507679871,99.99,99.99
2023-10-20,85528,7.0,2.0,25,Washing Machine,Appliances,8,no,Cash,Christopher Miller,2020-01-11,Los Angeles,2016-07-28,B,1,3,,,,,0.0,Alexander Miller,alexander.miller@...,trx-193384855491,499.99,3999.92
2022-11-17,85500,5.0,1.0,14,Camera,Electronics,10,no,Cash,David Wilson,2021-04-08,New York,2017-01-15,A,1,4,,,,,0.15,John Brown,john.brown@hotmai...,trx-831626097654,399.99,3399.915095365048
2022-09-27,85545,4.0,5.0,14,Camera,Electronics,6,no,Credit Card,Emily Brown,2020-10-25,Phoenix,2017-09-20,C,1,5,,,,,0.2,Sophia Wilson,sophia.wilson@hot...,trx-158496122054,399.99,1919.9520286095144
2022-04-21,85561,4.0,1.0,30,Electric Kettle,Appliances,6,no,Credit Card,Emily Brown,2020-10-25,New York,2017-01-15,A,1,6,,,,,0.2,Alexander Moore,alexander.moore@y...,trx-722817999024,24.99,119.95200178742408
2023-04-28,85520,1.0,1.0,26,Vacuum Cleaner,Appliances,4,no,Cash,John Doe,2020-06-03,New York,2017-01-15,A,1,7,,,,,0.0,Alexander Wilson,alexander.wilson@...,trx-813287633702,199.99,799.96
2023-03-08,85488,6.0,2.0,18,Boots,Footwear,10,no,Credit Card,Emma Taylor,2019-03-28,Los Angeles,2016-07-28,B,1,8,,,,,0.0,Michael Miller,michael.miller@ya...,trx-219568257432,149.99,1499.9
2023-06-17,85466,5.0,2.0,16,Skirt,Clothing,8,no,Cash,David Wilson,2021-04-08,Los Angeles,2016-07-28,B,1,9,,,,,0.0,Michael Brown,michael.brown@yah...,trx-352160720823,39.99,319.92


In [69]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/all_groups_merged_df.csv"
all_groups_merged_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [70]:
from pyspark.sql.functions import col, dayofmonth, dayofweek, dayofyear, month, year, weekofyear, quarter, regexp_replace, date_format
from datetime import datetime, timedelta

# Generate Date Range (Example: from 2023-01-01 to 2023-01-05)
start_date = datetime(2010, 1, 1)
end_date = datetime(2030, 12, 31)
date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

# Create DataFrame
date_df = spark.createDataFrame([(date,) for date in date_list], ["date"])

# Convert date column to match transaction_date format (yyyy-MM-dd)
date_df = date_df.withColumn("date", date_format(col("date"), "yyyy-MM-dd"))

# Add Surrogate Key (yyyyMMdd format)
date_df = date_df.withColumn("date_key", 
                             regexp_replace(date_format(col("date"), "yyyyMMdd"), "-", "").cast("long"))

# Extract Date Components
date_df = date_df.withColumn("day", dayofmonth(col("date"))) \
    .withColumn("day_of_week", dayofweek(col("date"))) \
    .withColumn("day_of_year", dayofyear(col("date"))) \
    .withColumn("week_of_year", weekofyear(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("quarter", quarter(col("date"))) \
    .withColumn("year", year(col("date"))) \
    .withColumn("is_weekend", (col("day_of_week") >= 6).cast("integer")) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .withColumn("day_name", date_format(col("date"), "EEEE")) \
    .withColumn("year_month", date_format(col("date"), "yyyyMM"))

# Show the DataFrame
date_df.show()

output_path = "/Graduation_Project/transformed_data/schema/date_dim.csv"
date_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)


+----------+--------+---+-----------+-----------+------------+-----+-------+----+----------+----------+---------+----------+
|      date|date_key|day|day_of_week|day_of_year|week_of_year|month|quarter|year|is_weekend|month_name| day_name|year_month|
+----------+--------+---+-----------+-----------+------------+-----+-------+----+----------+----------+---------+----------+
|2010-01-01|20100101|  1|          6|          1|          53|    1|      1|2010|         1|   January|   Friday|    201001|
|2010-01-02|20100102|  2|          7|          2|          53|    1|      1|2010|         1|   January| Saturday|    201001|
|2010-01-03|20100103|  3|          1|          3|          53|    1|      1|2010|         0|   January|   Sunday|    201001|
|2010-01-04|20100104|  4|          2|          4|           1|    1|      1|2010|         0|   January|   Monday|    201001|
|2010-01-05|20100105|  5|          3|          5|           1|    1|      1|2010|         0|   January|  Tuesday|    201001|


In [71]:
offline_fact_df=offline_merged_df.select(
    col("transaction_id"),
    col("transaction_date"),
    col("customer_id"),
    col("sales_agent_id"),
    col("branch_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("total_price"))
offline_fact_df.show(5)

+----------------+----------------+-----------+--------------+---------+----------+-----+----------+--------+--------------+------------------+
|  transaction_id|transaction_date|customer_id|sales_agent_id|branch_id|product_id|units|unit_price|discount|payment_method|       total_price|
+----------------+----------------+-----------+--------------+---------+----------+-----+----------+--------+--------------+------------------+
|trx-878108770002|      2022-07-19|      85513|          10.0|      6.0|        27|    7|     29.99|     0.0|          Cash|209.92999999999998|
|trx-349443438637|      2023-08-06|      85510|           2.0|      6.0|        28|   10|     19.99|     0.1|          Cash|179.90999523401257|
|trx-045891300294|      2022-12-28|      85553|           6.0|      4.0|        13|    2|    149.99|    0.15|   Credit Card|254.98300715208055|
|trx-756996252944|      2023-06-28|      85520|           3.0|      2.0|        12|    6|    299.99|     0.0|   Credit Card|           1

In [72]:

# Convert transaction_date in online_fact_df to match the format yyyy-MM-dd
offline_fact_df = offline_fact_df.withColumn("transaction_date", date_format(col("transaction_date"), "yyyy-MM-dd"))

# Perform an inner join on transaction_date and date
joined_df = offline_fact_df.join(date_df, offline_fact_df["transaction_date"] == date_df["date"], "inner")

joined_df.coalesce(1)




transaction_id,transaction_date,customer_id,sales_agent_id,branch_id,product_id,units,unit_price,discount,payment_method,total_price,date,date_key,day,day_of_week,day_of_year,week_of_year,month,quarter,year,is_weekend,month_name,day_name,year_month
trx-407964138068,2022-10-05,85479,4.0,5.0,25,8,499.99,0.1,Cash,3599.927904634476,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-843694845999,2022-10-05,85506,11.0,1.0,15,10,29.99,0.05,Credit Card,284.9049964249134,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-548311587674,2022-10-05,85541,5.0,3.0,19,10,29.99,0.05,Cash,284.9049964249134,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-611821275884,2022-10-05,85470,7.0,5.0,25,1,499.99,0.0,Credit Card,499.99,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-527486306638,2022-10-05,85544,10.0,2.0,23,5,39.99,0.1,Cash,179.95499523282052,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-012682136132,2022-10-05,85480,1.0,4.0,5,4,19.99,0.1,Cash,71.96399809360504,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-167057015608,2022-10-05,85519,8.0,2.0,7,10,59.99,0.0,Cash,599.9,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-655869782401,2022-10-05,85527,8.0,2.0,28,10,19.99,0.2,Cash,159.9200023829937,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-725477728975,2022-10-05,85462,4.0,6.0,18,6,149.99,0.0,Cash,899.94,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210
trx-079756546639,2022-10-05,85531,8.0,2.0,16,5,39.99,0.0,Credit Card,199.95,2022-10-05,20221005,5,4,278,40,10,4,2022,0,October,Wednesday,202210


In [73]:
joined_df.columns

['transaction_id',
 'transaction_date',
 'customer_id',
 'sales_agent_id',
 'branch_id',
 'product_id',
 'units',
 'unit_price',
 'discount',
 'payment_method',
 'total_price',
 'date',
 'date_key',
 'day',
 'day_of_week',
 'day_of_year',
 'week_of_year',
 'month',
 'quarter',
 'year',
 'is_weekend',
 'month_name',
 'day_name',
 'year_month']

In [74]:
offline_fact_df=joined_df.select(
    col("transaction_id"),
    col("customer_id"),
    col("sales_agent_id"),
    col("branch_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("total_price"),
    col("date_key")
)

offline_fact_df.coalesce(1)

transaction_id,customer_id,sales_agent_id,branch_id,product_id,units,unit_price,discount,payment_method,total_price,date_key
trx-407964138068,85479,4.0,5.0,25,8,499.99,0.1,Cash,3599.927904634476,20221005
trx-843694845999,85506,11.0,1.0,15,10,29.99,0.05,Credit Card,284.9049964249134,20221005
trx-548311587674,85541,5.0,3.0,19,10,29.99,0.05,Cash,284.9049964249134,20221005
trx-611821275884,85470,7.0,5.0,25,1,499.99,0.0,Credit Card,499.99,20221005
trx-527486306638,85544,10.0,2.0,23,5,39.99,0.1,Cash,179.95499523282052,20221005
trx-012682136132,85480,1.0,4.0,5,4,19.99,0.1,Cash,71.96399809360504,20221005
trx-167057015608,85519,8.0,2.0,7,10,59.99,0.0,Cash,599.9,20221005
trx-655869782401,85527,8.0,2.0,28,10,19.99,0.2,Cash,159.9200023829937,20221005
trx-725477728975,85462,4.0,6.0,18,6,149.99,0.0,Cash,899.94,20221005
trx-079756546639,85531,8.0,2.0,16,5,39.99,0.0,Credit Card,199.95,20221005


In [75]:
online_fact_df=online_merged_df.select(
    col("transaction_date"),
    col("transaction_id"),
    col("customer_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("group"),
    col("shipping_street_name"),
    col("shipping_city"),
    col("shipping_state"),
    col("shipping_zip_code"),
    col("total_price")
)

online_fact_df.coalesce(1)

transaction_date,transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,total_price
2023-06-02,trx-035992640061,85513,19,7,29.99,0.0,Stripe,6,700 Towering Oak ...,Glen Burnie,Maryland,21061,209.93
2022-12-13,trx-450105704924,85492,28,8,19.99,0.0,Stripe,6,4728 Rockland Trail,Nashville,Tennessee,37013,159.92
2023-05-27,trx-184986835512,85556,16,7,39.99,0.0,Credit Card,6,44 Downey Drive,Manchester,Connecticut,6040,279.93
2022-03-23,trx-934277502808,85550,28,9,19.99,0.2,Credit Card,6,3612 Jim Robison ...,Edmond,Oklahoma,73013,143.92800214469432
2022-03-06,trx-420381525195,85518,28,10,19.99,0.05,PayPal,6,622 Elaine Drive,Nashville,Tennessee,37211,189.90499761700627
2022-08-21,trx-633490862447,85529,1,2,999.99,0.2,PayPal,6,129 Glen Street,West Brattleboro,Vermont,5301,1599.9840238416195
2022-01-02,trx-349483175929,85538,22,9,79.99,0.0,PayPal,6,88 Florence Street,New Bedford,Massachusetts,2740,719.91
2023-04-26,trx-157422050098,85557,22,10,79.99,0.1,PayPal,6,8101 Vaughn Road,Montgomery,Alabama,36117,719.9099809288979
2022-09-22,trx-732921927457,85560,9,10,129.99,0.05,PayPal,6,115 Maple Street,Manchester,Connecticut,6040,1234.9049845039845
2022-09-22,trx-777834732693,85484,18,7,149.99,0.0,Credit Card,6,169 Avenida Drive,Berkeley,California,94708,1049.93


In [76]:
# Convert transaction_date in online_fact_df to match the format yyyy-MM-dd
online_fact_df = online_fact_df.withColumn("transaction_date", date_format(col("transaction_date"), "yyyy-MM-dd"))

# Perform an inner join on transaction_date and date
joined_df_2 = online_fact_df.join(date_df, online_fact_df["transaction_date"] == date_df["date"], "inner")

joined_df_2.coalesce(1)



transaction_date,transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,total_price,date,date_key,day,day_of_week,day_of_year,week_of_year,month,quarter,year,is_weekend,month_name,day_name,year_month
2022-01-01,trx-038966531476,85511,11,9,899.99,0.0,PayPal,2,1783 Blakely Road,Colchester,Vermont,5446,8099.91,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-196650129641,85502,8,1,79.99,0.0,Stripe,2,88 Shute Street,Everett,Massachusetts,2149,79.99,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-961186738199,85540,9,8,129.99,0.2,PayPal,3,45 Parsons Avenue,Saint Albans City,Vermont,5478,831.9360123968125,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-961186738199,85540,9,8,129.99,0.2,PayPal,1,45 Parsons Avenue,Saint Albans City,Vermont,5478,831.9360123968125,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-807433736285,85505,24,9,49.99,0.2,Stripe,4,6460 Vermont 113,Vershire,Vermont,5079,359.9280053633452,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-378573229506,85552,7,1,59.99,0.05,Credit Card,6,1265 Xavier Avenue,Hayward,California,94545,56.99049928486348,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-812311767786,85525,11,8,899.99,0.2,Stripe,6,129 Glen Street,West Brattleboro,Vermont,5301,5759.936085829735,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-714156427993,85519,5,4,19.99,0.0,PayPal,6,52 Linnmore Drive,Manchester,Connecticut,6040,79.96,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-926752978689,85553,15,10,29.99,0.0,PayPal,6,1313 North Road,Hinesburg,Vermont,5461,299.9,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-377988149376,85548,30,6,24.99,0.0,Stripe,6,3328 Sunny Meadow...,Birmingham,Alabama,35242,149.94,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201


In [77]:
joined_df_2.columns

['transaction_date',
 'transaction_id',
 'customer_id',
 'product_id',
 'units',
 'unit_price',
 'discount',
 'payment_method',
 'group',
 'shipping_street_name',
 'shipping_city',
 'shipping_state',
 'shipping_zip_code',
 'total_price',
 'date',
 'date_key',
 'day',
 'day_of_week',
 'day_of_year',
 'week_of_year',
 'month',
 'quarter',
 'year',
 'is_weekend',
 'month_name',
 'day_name',
 'year_month']

In [78]:
online_fact_df=joined_df_2.select(
    col("transaction_id"),
    col("customer_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("group"),
    col("total_price"),
    col("date_key"),
    col("shipping_zip_code"),
    col("shipping_state"),
    col("shipping_city"),
    col("shipping_street_name"))

online_fact_df.coalesce(1)

transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name
trx-038966531476,85511,11,9,899.99,0.0,PayPal,2,8099.91,20220101,5446,Vermont,Colchester,1783 Blakely Road
trx-196650129641,85502,8,1,79.99,0.0,Stripe,2,79.99,20220101,2149,Massachusetts,Everett,88 Shute Street
trx-961186738199,85540,9,8,129.99,0.2,PayPal,3,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue
trx-961186738199,85540,9,8,129.99,0.2,PayPal,1,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue
trx-807433736285,85505,24,9,49.99,0.2,Stripe,4,359.9280053633452,20220101,5079,Vermont,Vershire,6460 Vermont 113
trx-378573229506,85552,7,1,59.99,0.05,Credit Card,6,56.99049928486348,20220101,94545,California,Hayward,1265 Xavier Avenue
trx-812311767786,85525,11,8,899.99,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street
trx-714156427993,85519,5,4,19.99,0.0,PayPal,6,79.96,20220101,6040,Connecticut,Manchester,52 Linnmore Drive
trx-926752978689,85553,15,10,29.99,0.0,PayPal,6,299.9,20220101,5461,Vermont,Hinesburg,1313 North Road
trx-377988149376,85548,30,6,24.99,0.0,Stripe,6,149.94,20220101,35242,Alabama,Birmingham,3328 Sunny Meadow...


In [79]:
product_dim_df=all_groups_merged_df.select(
    col("product_id"),
    col("product_name"),
    col("product_category"),
    col("unit_price")).distinct()

product_dim_df.show()

+----------+-----------------+----------------+----------+
|product_id|     product_name|product_category|unit_price|
+----------+-----------------+----------------+----------+
|        29|Hair Straightener|      Appliances|     39.99|
|        19|          Sandals|        Footwear|     29.99|
|        12|          Monitor|     Electronics|    299.99|
|        17|           Blouse|        Clothing|     29.99|
|        18|            Boots|        Footwear|    149.99|
|        15|           Hoodie|        Clothing|     29.99|
|        10|          Sandals|        Footwear|     39.99|
|         1|           Laptop|     Electronics|    999.99|
|         6|            Jeans|        Clothing|     49.99|
|        16|            Skirt|        Clothing|     39.99|
|         7|            Dress|        Clothing|     59.99|
|        28|       Hair Dryer|      Appliances|     19.99|
|         3|           Tablet|     Electronics|    299.99|
|        14|           Camera|     Electronics|    399.9

In [80]:
# Define a Window specification
window_spec = Window.orderBy("product_id")

# Add a product_key column using row_number
product_dim_df = product_dim_df.withColumn("product_key", row_number().over(window_spec))

product_dim_df.coalesce(1)

product_id,product_name,product_category,unit_price,product_key
1,Laptop,Electronics,999.99,1
2,Smartphone,Electronics,699.99,2
3,Tablet,Electronics,299.99,3
4,Headphones,Electronics,99.99,4
5,T-Shirt,Clothing,19.99,5
6,Jeans,Clothing,49.99,6
7,Dress,Clothing,59.99,7
8,Sneakers,Footwear,79.99,8
9,Boots,Footwear,129.99,9
10,Sandals,Footwear,39.99,10


In [81]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/products_dim.csv"
product_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [82]:
# Perform an inner join on transaction_date and date
online_fact_df = online_fact_df.join(product_dim_df, online_fact_df["product_id"] == product_dim_df["product_id"], "inner")

In [83]:
online_fact_df.coalesce(1)

transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_id.1,product_name,product_category,unit_price.1,product_key
trx-549713340293,85522,26,8,199.99,0.0,Stripe,6,1599.92,20220101,21114,Maryland,Crofton,1630 Eton Way,26,Vacuum Cleaner,Appliances,199.99,26
trx-120912375499,85483,26,4,199.99,0.0,Credit Card,6,799.96,20220102,1902,Massachusetts,Lynn,172 Chestnut Street,26,Vacuum Cleaner,Appliances,199.99,26
trx-500133384271,85504,26,3,199.99,0.0,PayPal,6,599.97,20220102,1719,Massachusetts,Boxborough,302 Depot Road,26,Vacuum Cleaner,Appliances,199.99,26
trx-741895782895,85550,26,4,199.99,0.0,Credit Card,6,799.96,20220102,5075,Vermont,Thetford,398 Gove Hill Road,26,Vacuum Cleaner,Appliances,199.99,26
trx-830547671067,85464,26,5,199.99,0.0,Stripe,6,999.95,20220103,99611,Alaska,Kenai,51185 Helmsman St...,26,Vacuum Cleaner,Appliances,199.99,26
trx-087732089100,85534,26,5,199.99,0.15,PayPal,6,849.9575238406659,20220103,21122,Maryland,Pasadena,202 Winston Road,26,Vacuum Cleaner,Appliances,199.99,26
trx-227505126526,85504,26,10,199.99,0.15,Stripe,6,1699.9150476813318,20220103,6040,Connecticut,Manchester,89 Frances Drive,26,Vacuum Cleaner,Appliances,199.99,26
trx-512595649334,85561,26,8,199.99,0.0,Credit Card,6,1599.92,20220103,85304,Arizona,Glendale,13066 North 56th ...,26,Vacuum Cleaner,Appliances,199.99,26
trx-950565917451,85512,26,5,199.99,0.0,Stripe,6,999.95,20220103,21060,Maryland,Glen Burnie,140 William Chamb...,26,Vacuum Cleaner,Appliances,199.99,26
trx-361642366546,85526,26,5,199.99,0.2,Credit Card,6,799.9600119203329,20220104,36116,Alabama,Montgomery,4560 Hurlston Drive,26,Vacuum Cleaner,Appliances,199.99,26


In [84]:
online_fact_df=online_fact_df.drop("product_id", "product_name", "product_category", "unit_price")
online_fact_df.coalesce(1)

transaction_id,customer_id,units,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_key
trx-038966531476,85511,9,0.0,PayPal,2,8099.91,20220101,5446,Vermont,Colchester,1783 Blakely Road,11
trx-196650129641,85502,1,0.0,Stripe,2,79.99,20220101,2149,Massachusetts,Everett,88 Shute Street,8
trx-961186738199,85540,8,0.2,PayPal,3,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9
trx-961186738199,85540,8,0.2,PayPal,1,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9
trx-807433736285,85505,9,0.2,Stripe,4,359.9280053633452,20220101,5079,Vermont,Vershire,6460 Vermont 113,24
trx-378573229506,85552,1,0.05,Credit Card,6,56.99049928486348,20220101,94545,California,Hayward,1265 Xavier Avenue,7
trx-812311767786,85525,8,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street,11
trx-714156427993,85519,4,0.0,PayPal,6,79.96,20220101,6040,Connecticut,Manchester,52 Linnmore Drive,5
trx-926752978689,85553,10,0.0,PayPal,6,299.9,20220101,5461,Vermont,Hinesburg,1313 North Road,15
trx-377988149376,85548,6,0.0,Stripe,6,149.94,20220101,35242,Alabama,Birmingham,3328 Sunny Meadow...,30


In [85]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(product_dim_df, offline_fact_df["product_id"] == product_dim_df["product_id"], "inner")


In [86]:
offline_fact_df=offline_fact_df.drop("product_id", "product_name", "product_category", "unit_price")
offline_fact_df.coalesce(1)

transaction_id,customer_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key
trx-407964138068,85479,4.0,5.0,8,0.1,Cash,3599.927904634476,20221005,25
trx-843694845999,85506,11.0,1.0,10,0.05,Credit Card,284.9049964249134,20221005,15
trx-548311587674,85541,5.0,3.0,10,0.05,Cash,284.9049964249134,20221005,19
trx-611821275884,85470,7.0,5.0,1,0.0,Credit Card,499.99,20221005,25
trx-527486306638,85544,10.0,2.0,5,0.1,Cash,179.95499523282052,20221005,23
trx-012682136132,85480,1.0,4.0,4,0.1,Cash,71.96399809360504,20221005,5
trx-167057015608,85519,8.0,2.0,10,0.0,Cash,599.9,20221005,7
trx-655869782401,85527,8.0,2.0,10,0.2,Cash,159.9200023829937,20221005,28
trx-725477728975,85462,4.0,6.0,6,0.0,Cash,899.94,20221005,18
trx-079756546639,85531,8.0,2.0,5,0.0,Credit Card,199.95,20221005,16


In [87]:
customer_dim_df=all_groups_merged_df.select(
    col("customer_id"),
    col("customer_name"),
    col("cleaned_email")).dropDuplicates(["customer_id"])

customer_dim_df.show()

+-----------+------------------+--------------------+
|customer_id|     customer_name|       cleaned_email|
+-----------+------------------+--------------------+
|      85525|         Ava Jones| ava.jones@yahoo.com|
|      85527|       Emma Miller|emma.miller@outlo...|
|      85520|  Alexander Wilson|alexander.wilson@...|
|      85541|         Ava Jones|ava.jones@hotmail...|
|      85473|     Sophia Miller|sophia.miller@yah...|
|      85514|     Michael Smith|michael.smith@out...|
|      85547|      James Wilson|james.wilson@outl...|
|      85508|      James Miller|james.miller@hotm...|
|      85542|      James Miller|james.miller@yaho...|
|      85506|      Mia Williams|mia.williams@hotm...|
|      85515|    William Wilson|william.wilson@ou...|
|      85511|   Alexander Moore|alexander.moore@y...|
|      85477|Alexander Williams|alexander.william...|
|      85554|        John Brown|john.brown@gmail.com|
|      85490|        Mia Taylor|mia.taylor@yahoo.com|
|      85516|  Michael Willi

In [88]:
# Define a Window specification
window_spec = Window.orderBy("customer_id")

# Add a product_key column using row_number
customer_dim_df = customer_dim_df.withColumn("customer_key", row_number().over(window_spec))

customer_dim_df.coalesce(1)

customer_id,customer_name,cleaned_email,customer_key
85462,Olivia Brown,olivia.brown@yaho...,1
85463,Ava Miller,ava.miller@gmail.com,2
85464,Alexander Moore,alexander.moore@o...,3
85465,James Taylor,james.taylor@gmai...,4
85466,Michael Brown,michael.brown@yah...,5
85467,Alexander Jones,alexander.jones@o...,6
85468,William Davis,william.davis@yah...,7
85469,Emma Miller,emma.miller@outlo...,8
85470,William Williams,william.williams@...,9
85471,Ava Williams,ava.williams@outl...,10


In [89]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/customer_dim.csv"
customer_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [90]:
# Perform an inner join on transaction_date and date
online_fact_df = online_fact_df.join(customer_dim_df, online_fact_df["customer_id"] == customer_dim_df["customer_id"], "inner")
online_fact_df.coalesce(1)

transaction_id,customer_id,units,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_key,customer_id.1,customer_name,cleaned_email,customer_key
trx-812311767786,85525,8,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street,11,85525,Michael Wilson,michael.wilson@gm...,64
trx-777696509076,85525,10,0.15,PayPal,5,509.9150143027305,20220103,6040,Connecticut,Manchester,81 Lyness Street,7,85525,Michael Wilson,michael.wilson@gm...,64
trx-961752173520,85525,5,0.15,PayPal,6,254.9575071513653,20220103,6040,Connecticut,Manchester,117 Adelaide Road,7,85525,Michael Wilson,michael.wilson@gm...,64
trx-752459780179,85525,9,0.05,Credit Card,6,1111.4144860535862,20220103,85310,Arizona,Glendale,23124 North 71st ...,21,85525,Michael Wilson,michael.wilson@gm...,64
trx-895691945634,85525,5,0.2,Credit Card,6,159.96000238358977,20220104,5905,Vermont,Maidstone,3541 North Road,23,85525,Michael Wilson,michael.wilson@gm...,64
trx-985459269846,85525,8,0.05,Credit Card,6,1139.923985695839,20220106,2143,Massachusetts,Somerville,13 Warren Avenue,18,85525,Michael Wilson,michael.wilson@gm...,64
trx-667598624256,85525,10,0.0,Stripe,6,1299.9,20220106,37209,Tennessee,Nashville,410 51st Avenue N...,21,85525,Michael Wilson,michael.wilson@gm...,64
trx-566935476646,85525,9,0.1,Credit Card,6,404.9189892733097,20220106,6040,Connecticut,Manchester,46 Lilac Street,24,85525,Michael Wilson,michael.wilson@gm...,64
trx-574027283390,85525,1,0.15,PayPal,6,84.99150238394736,20220107,20001,District of Columbia,Washington,81 Seaton Place N...,4,85525,Michael Wilson,michael.wilson@gm...,64
trx-037078085183,85525,8,0.0,PayPal,6,1199.92,20220107,80003,Colorado,Arvada,7912 Depew Street,13,85525,Michael Wilson,michael.wilson@gm...,64


In [91]:
online_fact_df=online_fact_df.drop("customer_id","cleaned_email","customer_name")
online_fact_df.coalesce(1)

transaction_id,units,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_key,customer_key
trx-038966531476,9,0.0,PayPal,2,8099.91,20220101,5446,Vermont,Colchester,1783 Blakely Road,11,50
trx-196650129641,1,0.0,Stripe,2,79.99,20220101,2149,Massachusetts,Everett,88 Shute Street,8,41
trx-961186738199,8,0.2,PayPal,3,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9,79
trx-961186738199,8,0.2,PayPal,1,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9,79
trx-807433736285,9,0.2,Stripe,4,359.9280053633452,20220101,5079,Vermont,Vershire,6460 Vermont 113,24,44
trx-378573229506,1,0.05,Credit Card,6,56.99049928486348,20220101,94545,California,Hayward,1265 Xavier Avenue,7,91
trx-812311767786,8,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street,11,64
trx-714156427993,4,0.0,PayPal,6,79.96,20220101,6040,Connecticut,Manchester,52 Linnmore Drive,5,58
trx-926752978689,10,0.0,PayPal,6,299.9,20220101,5461,Vermont,Hinesburg,1313 North Road,15,92
trx-377988149376,6,0.0,Stripe,6,149.94,20220101,35242,Alabama,Birmingham,3328 Sunny Meadow...,30,87


In [92]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(customer_dim_df, offline_fact_df["customer_id"] == customer_dim_df["customer_id"], "inner")
offline_fact_df.coalesce(1)

transaction_id,customer_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key,customer_id.1,customer_name,cleaned_email,customer_key
trx-170004611332,85525,7.0,5.0,10,0.05,Cash,237.40499702095985,20220527,30,85525,Michael Wilson,michael.wilson@gm...,64
trx-844036209605,85525,10.0,4.0,7,0.0,Credit Card,1049.93,20220527,18,85525,Michael Wilson,michael.wilson@gm...,64
trx-109416203740,85525,11.0,6.0,1,0.0,Credit Card,39.99,20220527,23,85525,Michael Wilson,michael.wilson@gm...,64
trx-106504733120,85525,10.0,6.0,7,0.0,Credit Card,4899.93,20220527,2,85525,Michael Wilson,michael.wilson@gm...,64
trx-745057689089,85525,9.0,6.0,5,0.2,Credit Card,1199.9600178807975,20220527,12,85525,Michael Wilson,michael.wilson@gm...,64
trx-714247080040,85525,7.0,4.0,8,0.0,Cash,639.92,20220718,8,85525,Michael Wilson,michael.wilson@gm...,64
trx-115302227341,85525,9.0,6.0,8,0.0,Cash,7999.92,20220718,1,85525,Michael Wilson,michael.wilson@gm...,64
trx-991057034635,85525,5.0,4.0,7,0.1,Cash,503.9369866502285,20220718,8,85525,Michael Wilson,michael.wilson@gm...,64
trx-815328334349,85525,10.0,5.0,7,0.0,Credit Card,1049.93,20220718,18,85525,Michael Wilson,michael.wilson@gm...,64
trx-197754917684,85525,1.0,3.0,9,0.0,Credit Card,359.91,20220718,10,85525,Michael Wilson,michael.wilson@gm...,64


In [93]:
offline_fact_df=offline_fact_df.drop( "customer_id")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key
trx-823890000599,8.0,6.0,2,0.0,Credit Card,1999.98,20220312,1,Michael Wilson,michael.wilson@gm...,64
trx-398777947283,9.0,4.0,6,0.0,Credit Card,179.94,20220312,27,Michael Wilson,michael.wilson@gm...,64
trx-989278595733,9.0,6.0,6,0.0,Cash,779.94,20220312,21,Michael Wilson,michael.wilson@gm...,64
trx-273180803756,3.0,1.0,1,0.2,Credit Card,63.99200095355511,20220312,22,Michael Wilson,michael.wilson@gm...,64
trx-717300843037,5.0,2.0,1,0.2,Credit Card,799.9920119208098,20220312,1,Michael Wilson,michael.wilson@gm...,64
trx-864607890708,8.0,2.0,10,0.2,Credit Card,7199.920107287168,20220325,11,Michael Wilson,michael.wilson@gm...,64
trx-834287510675,2.0,5.0,6,0.0,Credit Card,359.94,20220325,7,Michael Wilson,michael.wilson@gm...,64
trx-395878299405,4.0,3.0,6,0.0,Cash,779.94,20220325,9,Michael Wilson,michael.wilson@gm...,64
trx-505177293904,11.0,3.0,8,0.0,Credit Card,2399.92,20220325,3,Michael Wilson,michael.wilson@gm...,64
trx-249842381834,7.0,5.0,10,0.0,Credit Card,499.9,20220325,6,Michael Wilson,michael.wilson@gm...,64


In [94]:
branches_dim_df=offline_merged_df.select(
    col("branch_id"),
    col("branch_location"),
    col("branch_establish_date"),
    col("branch_class")).distinct()

branches_dim_df.show()

+---------+---------------+---------------------+------------+
|branch_id|branch_location|branch_establish_date|branch_class|
+---------+---------------+---------------------+------------+
|      3.0|        Chicago|           2015-03-10|           A|
|      5.0|        Phoenix|           2017-09-20|           C|
|      1.0|       New York|           2017-01-15|           A|
|      2.0|    Los Angeles|           2016-07-28|           B|
|      4.0|        Houston|           2016-11-05|           D|
|      6.0|       Oklahoma|           2016-09-20|           A|
+---------+---------------+---------------------+------------+



In [95]:
# Define a Window specification
window_spec = Window.orderBy("branch_id")

# Add a product_key column using row_number
branches_dim_df = branches_dim_df.withColumn("branch_key", row_number().over(window_spec))

branches_dim_df.coalesce(1)

branch_id,branch_location,branch_establish_date,branch_class,branch_key
1.0,New York,2017-01-15,A,1
2.0,Los Angeles,2016-07-28,B,2
3.0,Chicago,2015-03-10,A,3
4.0,Houston,2016-11-05,D,4
5.0,Phoenix,2017-09-20,C,5
6.0,Oklahoma,2016-09-20,A,6


In [96]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/branches_dim.csv"
branches_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [97]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(branches_dim_df, offline_fact_df["branch_id"] == branches_dim_df["branch_id"], "inner")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_id.1,branch_location,branch_establish_date,branch_class,branch_key
trx-752036065952,5.0,1.0,2,0.0,Credit Card,199.98,20220412,4,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-966555221163,4.0,1.0,1,0.0,Credit Card,149.99,20221008,13,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-438034122453,8.0,1.0,9,0.15,Credit Card,382.4235107266903,20221008,24,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-364183758782,3.0,1.0,3,0.15,Credit Card,76.47450214505196,20221201,15,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-861589695084,8.0,1.0,8,0.2,Credit Card,3199.9360476827624,20220718,25,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-723526461132,8.0,1.0,8,0.0,Credit Card,159.92,20220402,28,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-128824540336,3.0,1.0,6,0.0,Credit Card,599.9399999999999,20220402,4,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-128824540336,3.0,1.0,6,0.0,Credit Card,599.9399999999999,20220402,4,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-389151782527,6.0,1.0,9,0.05,Cash,1709.914478543401,20220402,26,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-921810114081,5.0,1.0,6,0.0,Credit Card,179.94,20220402,27,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1


In [98]:
offline_fact_df=offline_fact_df.drop("branch_class", "branch_establish_date", "branch_location","branch_id")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_key
trx-288544269912,2.0,4,0.2,Credit Card,2879.9680429148675,20221005,11,Michael Wilson,michael.wilson@gm...,64,2
trx-453952256916,5.0,2,0.0,Cash,79.98,20221005,10,Michael Wilson,michael.wilson@gm...,64,6
trx-320018303559,7.0,10,0.2,Cash,319.92000476717953,20221005,10,Michael Wilson,michael.wilson@gm...,64,4
trx-024932783521,1.0,6,0.0,Cash,2999.94,20221005,25,Michael Wilson,michael.wilson@gm...,64,3
trx-209702051200,11.0,5,0.0,Cash,1499.95,20221005,12,Michael Wilson,michael.wilson@gm...,64,1
trx-267818677599,3.0,5,0.0,Credit Card,399.95,20221005,22,Michael Wilson,michael.wilson@gm...,64,2
trx-956064519174,11.0,5,0.0,Cash,199.95,20221005,29,Michael Wilson,michael.wilson@gm...,64,3
trx-480863914129,11.0,2,0.0,Credit Card,79.98,20221005,29,Michael Wilson,michael.wilson@gm...,64,1
trx-112384316514,2.0,6,0.1,Credit Card,215.94599427938465,20230121,10,Michael Wilson,michael.wilson@gm...,64,2
trx-766570299689,5.0,6,0.05,Cash,227.9429971396923,20230121,16,Michael Wilson,michael.wilson@gm...,64,3


In [99]:
sales_agent_dim_df=offline_merged_df.select(
    col("sales_agent_id"),
    col("sales_agent_hire_date"),
    col("sales_agent_name")).dropDuplicates(["sales_agent_id"])

sales_agent_dim_df.show()

+--------------+---------------------+------------------+
|sales_agent_id|sales_agent_hire_date|  sales_agent_name|
+--------------+---------------------+------------------+
|           1.0|           2020-06-10|          John Doe|
|           9.0|           2019-07-19|   Daniel Martinez|
|          10.0|           2019-11-10|      Sophia Moore|
|           5.0|           2020-06-23|      David Wilson|
|           6.0|           2018-08-09|       Emma Taylor|
|           4.0|           2018-11-12|       Emily Brown|
|           7.0|           2018-07-05|Christopher Miller|
|          11.0|           2018-07-10|         john wick|
|           2.0|           2021-06-08|        Jane Smith|
|           8.0|           2019-12-08|      Olivia Davis|
|           3.0|           2019-07-22|   Michael Johnson|
+--------------+---------------------+------------------+



In [100]:
# Define a Window specification
window_spec = Window.orderBy("sales_agent_id")

# Add a product_key column using row_number
sales_agent_dim_df = sales_agent_dim_df.withColumn("sales_agent_key", row_number().over(window_spec))

sales_agent_dim_df.coalesce(1)

sales_agent_id,sales_agent_hire_date,sales_agent_name,sales_agent_key
1.0,2020-06-10,John Doe,1
10.0,2019-11-10,Sophia Moore,2
11.0,2018-07-10,john wick,3
2.0,2021-06-08,Jane Smith,4
3.0,2019-07-22,Michael Johnson,5
4.0,2018-11-12,Emily Brown,6
5.0,2020-06-23,David Wilson,7
6.0,2018-08-09,Emma Taylor,8
7.0,2018-07-05,Christopher Miller,9
8.0,2019-12-08,Olivia Davis,10


In [101]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/sales_agent_dim.csv"
sales_agent_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [102]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(sales_agent_dim_df, offline_fact_df["sales_agent_id"] == sales_agent_dim_df["sales_agent_id"], "inner")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_key,sales_agent_id.1,sales_agent_hire_date,sales_agent_name,sales_agent_key
trx-448099421943,1.0,5,0.15,Credit Card,339.9575095355511,20230417,22,Olivia Taylor,olivia.taylor@out...,59,2,1.0,2020-06-10,John Doe,1
trx-406663335952,1.0,4,0.0,Credit Card,519.96,20230421,9,Olivia Taylor,olivia.taylor@out...,59,5,1.0,2020-06-10,John Doe,1
trx-813287633702,1.0,4,0.0,Cash,799.96,20230428,26,Olivia Taylor,olivia.taylor@out...,59,1,1.0,2020-06-10,John Doe,1
trx-813287633702,1.0,4,0.0,Cash,799.96,20230428,26,Olivia Taylor,olivia.taylor@out...,59,1,1.0,2020-06-10,John Doe,1
trx-228542307923,1.0,7,0.0,Credit Card,209.93,20230607,15,Olivia Taylor,olivia.taylor@out...,59,5,1.0,2020-06-10,John Doe,1
trx-903716220298,1.0,4,0.2,Credit Card,415.9680061984063,20230310,9,Olivia Taylor,olivia.taylor@out...,59,1,1.0,2020-06-10,John Doe,1
trx-440880859229,1.0,7,0.0,Credit Card,279.93,20230721,16,Olivia Taylor,olivia.taylor@out...,59,2,1.0,2020-06-10,John Doe,1
trx-461132132500,1.0,1,0.05,Credit Card,47.49049940407277,20220321,24,Olivia Taylor,olivia.taylor@out...,59,4,1.0,2020-06-10,John Doe,1
trx-037068981368,1.0,3,0.0,Cash,2999.9700000000003,20220321,1,Olivia Taylor,olivia.taylor@out...,59,3,1.0,2020-06-10,John Doe,1
trx-382553805112,1.0,4,0.05,Cash,303.9619961857795,20220723,8,Olivia Taylor,olivia.taylor@out...,59,5,1.0,2020-06-10,John Doe,1


In [103]:
offline_fact_df=offline_fact_df.drop("sales_agent_id","sales_agent_name","sales_agent_hire_date")
offline_fact_df.coalesce(1)

transaction_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_key,sales_agent_key
trx-444160413883,3,0.15,Cash,50.97450142979622,20221007,28,Ava Jones,ava.jones@yahoo.com,64,4,11
trx-165719496429,8,0.0,Credit Card,799.92,20221007,4,Ava Jones,ava.jones@yahoo.com,64,6,2
trx-214614885507,3,0.0,Credit Card,389.97,20221007,9,Ava Jones,ava.jones@yahoo.com,64,3,8
trx-207212358314,7,0.0,Credit Card,1049.93,20221007,13,Ava Jones,ava.jones@yahoo.com,64,5,7
trx-792804914907,4,0.2,Cash,639.9680095362663,20221007,26,Ava Jones,ava.jones@yahoo.com,64,2,7
trx-272573559115,3,0.1,Cash,269.97299284815784,20221007,4,Ava Jones,ava.jones@yahoo.com,64,1,8
trx-998492206833,9,0.0,Cash,269.91,20221007,19,Ava Jones,ava.jones@yahoo.com,64,4,10
trx-459173050315,6,0.0,Cash,1799.94,20221007,12,Ava Jones,ava.jones@yahoo.com,64,2,6
trx-375078371845,10,0.0,Credit Card,399.9,20221007,16,Ava Jones,ava.jones@yahoo.com,64,5,7
trx-836483831918,6,0.0,Credit Card,899.94,20230417,13,Ava Jones,ava.jones@yahoo.com,64,6,11


In [104]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/offline_transactions_fact.csv"
offline_fact_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [105]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/online_transactions_fact.csv"
online_fact_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [106]:
online_fact_df.count()

52500

In [107]:
offline_fact_df.count()

505000

In [108]:
spark.stop()