In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark.sql.window import Window
from itertools import chain 
from typing import Dict
from pyspark.sql.functions import col, lit, when, coalesce, split, concat, udf, concat_ws, row_number, \
                                    regexp_replace, lower, monotonically_increasing_id, regexp_extract, create_map
import re 
from datetime import datetime 
import os

In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from datetime import datetime

# Configuration
class Config:
    RAW_BASE_PATH = "/user/itversity/q-company_raw_layer"
    STANDARDIZED_BASE_PATH = "/user/itversity/q-company_standardized_layer"
    CONFORMED_NORMALIZED_BASE_PATH = "/user/itversity/q-company_conformed_layer/normalized_model"
    CONFORMED_DENORMALIZED_BASE_PATH = "/user/itversity/q-company_conformed_layer/denormalized_model"
    
class Schemas:
    sales_transcation_schema = StructType([
            StructField("transaction_date", DateType(), nullable=False),
            StructField("transaction_id", StringType(), nullable=False),
            StructField("customer_id", LongType(), nullable=False),
            StructField("customer_fname", StringType(), nullable=False),
            StructField("customer_lname", StringType(), nullable=False),
            StructField("customer_email", StringType(), nullable=False),
            StructField("sales_agent_id", LongType(), nullable=True),
            StructField("branch_id", LongType(), nullable=True),
            StructField("product_id", LongType(), nullable=False),
            StructField("product_name", StringType(), nullable=False),
            StructField("product_category", StringType(), nullable=False),
            StructField("offer_1", StringType(), nullable=True),
            StructField("offer_2", StringType(), nullable=True),
            StructField("offer_3", StringType(), nullable=True),
            StructField("offer_4", StringType(), nullable=True),
            StructField("offer_5", StringType(), nullable=True),
            StructField("units", LongType(), nullable=False),
            StructField("unit_price", DoubleType(), nullable=False),
            StructField("is_online", StringType(), nullable=False),
            StructField("payment_method", StringType(), nullable=False),
            StructField("shipping_address", StringType(), nullable=True),
            StructField("name", StringType(), nullable=True),
            StructField("hire_date", DateType(), nullable=True),
            StructField("location", StringType(), nullable=True),
            StructField("establish_date", DateType(), nullable=True),
            StructField("class", StringType(), nullable=True),
            StructField("group", StringType(), nullable=False)
        ])

    offline_sales_transcation_schema = StructType([
            StructField("transaction_date", DateType(), nullable=False),
            StructField("transaction_id", StringType(), nullable=False),
            StructField("customer_id", LongType(), nullable=False),
            StructField("customer_name", StringType(), nullable=False),
            StructField("customer_email", StringType(), nullable=False),
            StructField("sales_agent_id", LongType(), nullable=False),
            StructField("branch_id", LongType(), nullable=False),
            StructField("product_id", LongType(), nullable=False),
            StructField("product_name", StringType(), nullable=False),
            StructField("product_category", StringType(), nullable=False),
            StructField("units", IntegerType(), nullable=False),
            StructField("unit_price", DoubleType(), nullable=False),
            StructField("discount", FloatType(), nullable=False),
            StructField("total_price", DoubleType(), nullable=False),
            StructField("payment_method", StringType(), nullable=False),
            StructField("sales_agent_name", StringType(), nullable=False),
            StructField("sales_agent_hire_date", DateType(), nullable=False),
            StructField("branch_location", StringType(), nullable=False),
            StructField("branch_establish_date", DateType(), nullable=False),
            StructField("branch_class", StringType(), nullable=False),
            StructField("group", StringType(), nullable=False)
        ])

    online_sales_transcation_schema = StructType([
        StructField("transaction_date", DateType(), nullable=False),
        StructField("transaction_id", StringType(), nullable=False),
        StructField("customer_id", LongType(), nullable=False),
        StructField("customer_name", StringType(), nullable=False),
        StructField("customer_email", StringType(), nullable=False),
        StructField("product_id", LongType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("product_category", StringType(), nullable=False),
        StructField("units", IntegerType(), nullable=False),
        StructField("unit_price", DoubleType(), nullable=False),
        StructField("discount", FloatType(), nullable=False),
        StructField("total_price", DoubleType(), nullable=False),
        StructField("payment_method", StringType(), nullable=False),
        StructField("shipping_street_name",  StringType(), nullable=False), 
        StructField("shipping_city",  StringType(), nullable=False),
        StructField("shipping_state",  StringType(), nullable=False),
        StructField("shipping_zip_code",  StringType(), nullable=False),
        StructField("group", StringType(), nullable=False)
    ])
    
# Utility functions
class HDFSUtils:
    @staticmethod
    def get_latest_file(spark: SparkSession, hdfs_path: str) -> str:
        files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
            .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
        sorted_files = sorted(files, key=lambda f: f.getModificationTime(), reverse=True)
        return sorted_files[0].getPath().toString() if sorted_files else None
    

In [3]:
spark = SparkSession\
    .builder\
    .appName("DataTransformedLayer")\
    .getOrCreate()


In [4]:
def align_schemas(df1, df2):
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    # Columns present in df1 but not in df2
    for col_name in columns1 - columns2:
        df2 = df2.withColumn(col_name, lit(None))
    
    # Columns present in df2 but not in df1
    for col_name in columns2 - columns1:
        df1 = df1.withColumn(col_name, lit(None))
    
    # Ensure both DataFrames have the same column order
    all_columns = sorted(list(columns1.union(columns2)))
    df1 = df1.select(all_columns)
    df2 = df2.select(all_columns)
    
    return df1, df2


In [5]:
def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    print(schema)
    
    df = spark.read.schema(schema).parquet(path)
    
    return df

In [8]:
def process_denormalized_model(spark: SparkSession) -> None:
    current_day = datetime.now().strftime("%Y-%m-%d")
    
    hdfs_path = Config.STANDARDIZED_BASE_PATH
    
    input_base_path = hdfs_path + '/standardized_sales_transaction_' + current_day
        
    online_df = read_parquet_with_schema(spark, f"{input_base_path}/online_transactions*")
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    
    offline_df = read_parquet_with_schema(spark, f"{input_base_path}/offline_transactions*")
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))
    
    online_df_a, offline_df_a = align_schemas(offline_df_a, online_df_a)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
            'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
            'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
            'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
            'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
            'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
        ]
        

    return online_df, offline_df, all_df.select(new_order)


In [9]:
online_df, offline_df, all_df = process_denormalized_model(spark)

StructType(List(StructField(transaction_date,DateType,true),StructField(transaction_id,StringType,true),StructField(customer_id,LongType,true),StructField(customer_name,StringType,true),StructField(customer_email,StringType,true),StructField(product_id,LongType,true),StructField(product_name,StringType,true),StructField(product_category,StringType,true),StructField(units,LongType,true),StructField(unit_price,DoubleType,true),StructField(discount,FloatType,true),StructField(total_price,DoubleType,true),StructField(payment_method,StringType,true),StructField(shipping_street_name,StringType,true),StructField(shipping_city,StringType,true),StructField(shipping_state,StringType,true),StructField(shipping_zip_code,StringType,true),StructField(group,StringType,true)))
StructType(List(StructField(transaction_date,DateType,true),StructField(transaction_id,StringType,true),StructField(customer_id,LongType,true),StructField(customer_name,StringType,true),StructField(customer_email,StringType,true

In [24]:
def denorm_modeling(df: DataFrame, transaction_type: str) -> None:
    spark = SparkSession.builder.getOrCreate()
    
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    file_path = None
    
    if transaction_type == 'online':
        file_path = denorm_path + "/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        file_path = denorm_path + "/offline_fact_table/offline_merged"
    else:
        file_path = denorm_path + "/all_sales_fact_table/sales_merged"
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)

    schema = df.schema
    
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        
        existing_df.cache()
        
        merged_df = existing_df.unionByName(df).dropDuplicates()
        
        existing_df.unpersist()
    else:
        merged_df = df
    
    merged_df.write.option("schema", schema.json()).mode("overwrite").parquet(file_path)

    print(f"Appended {df.count()} rows to {file_path}")


In [23]:
denorm_modeling(online_df, 'online')

Appended 500 rows to /user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged


In [47]:
denorm_modeling(offline_df, 'offline')

Appended 1000 rows to /user/itversity/q-company_conformed_layer/denormalized_model/offline_fact_table/offline_merged


In [48]:
denorm_modeling(all_df, 'all')

Appended 1500 rows to /user/itversity/q-company_conformed_layer/denormalized_model/all_sales_fact_table/sales_merged


In [25]:
path = Config.CONFORMED_DENORMALIZED_BASE_PATH + "/online_fact_table/online_merged"

online = spark.read.parquet(path)

In [27]:
online2 = spark.read.parquet(path)

In [32]:
from pyspark.sql.functions import col, dayofmonth, dayofweek, dayofyear, month, year, weekofyear, quarter, regexp_replace, date_format
from datetime import datetime, timedelta

# Generate Date Range (Example: from 2023-01-01 to 2023-01-05)
start_date = datetime(2020, 1, 1)
end_date = datetime(2025, 12, 31)
date_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

# Create DataFrame
date_df = spark.createDataFrame([(date,) for date in date_list], ["date"])

# Convert date column to match transaction_date format (yyyy-MM-dd)
date_df = date_df.withColumn("date", date_format(col("date"), "yyyy-MM-dd"))

# Add Surrogate Key (yyyyMMdd format)
date_df = date_df.withColumn("date_key", 
                             regexp_replace(date_format(col("date"), "yyyyMMdd"), "-", "").cast("long"))

# Extract Date Components
date_df = date_df.withColumn("day", dayofmonth(col("date"))) \
    .withColumn("day_of_week", dayofweek(col("date"))) \
    .withColumn("day_of_year", dayofyear(col("date"))) \
    .withColumn("week_of_year", weekofyear(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("quarter", quarter(col("date"))) \
    .withColumn("year", year(col("date"))) \
    .withColumn("is_weekend", (col("day_of_week") >= 6).cast("integer")) \
    .withColumn("month_name", date_format(col("date"), "MMMM")) \
    .withColumn("day_name", date_format(col("date"), "EEEE")) \
    .withColumn("year_month", date_format(col("date"), "yyyyMM"))

output_path = Config.CONFORMED_NORMALIZED_BASE_PATH + "/date_dim/" + "date_dim_table"
date_df.write.mode('overwrite').parquet(output_path)


In [42]:
offline_fact_df = offline_df.select(
    col("transaction_id"),
    col("transaction_date"),
    col("customer_id"),
    col("sales_agent_id"),
    col("branch_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("total_price"))

In [43]:

# Convert transaction_date in online_fact_df to match the format yyyy-MM-dd
offline_fact_df = offline_fact_df.withColumn("transaction_date", date_format(col("transaction_date"), "yyyy-MM-dd"))

# Perform an inner join on transaction_date and date
joined_df = offline_fact_df.join(date_df, offline_fact_df["transaction_date"] == date_df["date"], "inner")

joined_df.show(5, False, True)


-RECORD 0----------------------------
 transaction_id   | trx-930670956039 
 transaction_date | 2022-01-01       
 customer_id      | 85462            
 sales_agent_id   | 6                
 branch_id        | 2                
 product_id       | 3                
 units            | 7                
 unit_price       | 299.99           
 discount         | 0.0              
 payment_method   | Credit Card      
 total_price      | 2099.93          
 date             | 2022-01-01       
 date_key         | 20220101         
 day              | 1                
 day_of_week      | 7                
 day_of_year      | 1                
 week_of_year     | 52               
 month            | 1                
 quarter          | 1                
 year             | 2022             
 is_weekend       | 1                
 month_name       | January          
 day_name         | Saturday         
 year_month       | 202201           
-RECORD 1----------------------------
 transaction

In [44]:
offline_fact_df=joined_df.select(
    col("transaction_id"),
    col("customer_id"),
    col("sales_agent_id"),
    col("branch_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("total_price"),
    col("date_key")
)

offline_fact_df.columns

['transaction_id',
 'customer_id',
 'sales_agent_id',
 'branch_id',
 'product_id',
 'units',
 'unit_price',
 'discount',
 'payment_method',
 'total_price',
 'date_key']

In [46]:
online_fact_df=online_df.select(
    col("transaction_date"),
    col("transaction_id"),
    col("customer_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("group"),
    col("shipping_street_name"),
    col("shipping_city"),
    col("shipping_state"),
    col("shipping_zip_code"),
    col("total_price")
)

online_fact_df.show(2, False, True)

-RECORD 0-----------------------------------------------
 transaction_date     | 2022-06-07                      
 transaction_id       | trx-620490661140                
 customer_id          | 85489                           
 product_id           | 25                              
 units                | 2                               
 unit_price           | 499.99                          
 discount             | 0.0                             
 payment_method       | Stripe                          
 group                | group1                          
 shipping_street_name | 3373 Runnymede Place Northwest  
 shipping_city        | Washington                      
 shipping_state       | District of Columbia            
 shipping_zip_code    | 20015                           
 total_price          | 999.98                          
-RECORD 1-----------------------------------------------
 transaction_date     | 2022-02-04                      
 transaction_id       | trx-895

In [76]:
# Convert transaction_date in online_fact_df to match the format yyyy-MM-dd
online_fact_df = online_fact_df.withColumn("transaction_date", date_format(col("transaction_date"), "yyyy-MM-dd"))

# Perform an inner join on transaction_date and date
joined_df_2 = online_fact_df.join(date_df, online_fact_df["transaction_date"] == date_df["date"], "inner")

joined_df_2.coalesce(1)



transaction_date,transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,shipping_street_name,shipping_city,shipping_state,shipping_zip_code,total_price,date,date_key,day,day_of_week,day_of_year,week_of_year,month,quarter,year,is_weekend,month_name,day_name,year_month
2022-01-01,trx-038966531476,85511,11,9,899.99,0.0,PayPal,2,1783 Blakely Road,Colchester,Vermont,5446,8099.91,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-196650129641,85502,8,1,79.99,0.0,Stripe,2,88 Shute Street,Everett,Massachusetts,2149,79.99,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-961186738199,85540,9,8,129.99,0.2,PayPal,3,45 Parsons Avenue,Saint Albans City,Vermont,5478,831.9360123968125,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-961186738199,85540,9,8,129.99,0.2,PayPal,1,45 Parsons Avenue,Saint Albans City,Vermont,5478,831.9360123968125,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-807433736285,85505,24,9,49.99,0.2,Stripe,4,6460 Vermont 113,Vershire,Vermont,5079,359.9280053633452,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-378573229506,85552,7,1,59.99,0.05,Credit Card,6,1265 Xavier Avenue,Hayward,California,94545,56.99049928486348,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-812311767786,85525,11,8,899.99,0.2,Stripe,6,129 Glen Street,West Brattleboro,Vermont,5301,5759.936085829735,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-714156427993,85519,5,4,19.99,0.0,PayPal,6,52 Linnmore Drive,Manchester,Connecticut,6040,79.96,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-926752978689,85553,15,10,29.99,0.0,PayPal,6,1313 North Road,Hinesburg,Vermont,5461,299.9,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201
2022-01-01,trx-377988149376,85548,30,6,24.99,0.0,Stripe,6,3328 Sunny Meadow...,Birmingham,Alabama,35242,149.94,2022-01-01,20220101,1,7,1,52,1,1,2022,1,January,Saturday,202201


In [77]:
joined_df_2.columns

['transaction_date',
 'transaction_id',
 'customer_id',
 'product_id',
 'units',
 'unit_price',
 'discount',
 'payment_method',
 'group',
 'shipping_street_name',
 'shipping_city',
 'shipping_state',
 'shipping_zip_code',
 'total_price',
 'date',
 'date_key',
 'day',
 'day_of_week',
 'day_of_year',
 'week_of_year',
 'month',
 'quarter',
 'year',
 'is_weekend',
 'month_name',
 'day_name',
 'year_month']

In [78]:
online_fact_df=joined_df_2.select(
    col("transaction_id"),
    col("customer_id"),
    col("product_id"),
    col("units"),
    col("unit_price"),
    col("discount"),
    col("payment_method"),
    col("group"),
    col("total_price"),
    col("date_key"),
    col("shipping_zip_code"),
    col("shipping_state"),
    col("shipping_city"),
    col("shipping_street_name"))

online_fact_df.coalesce(1)

transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name
trx-038966531476,85511,11,9,899.99,0.0,PayPal,2,8099.91,20220101,5446,Vermont,Colchester,1783 Blakely Road
trx-196650129641,85502,8,1,79.99,0.0,Stripe,2,79.99,20220101,2149,Massachusetts,Everett,88 Shute Street
trx-961186738199,85540,9,8,129.99,0.2,PayPal,3,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue
trx-961186738199,85540,9,8,129.99,0.2,PayPal,1,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue
trx-807433736285,85505,24,9,49.99,0.2,Stripe,4,359.9280053633452,20220101,5079,Vermont,Vershire,6460 Vermont 113
trx-378573229506,85552,7,1,59.99,0.05,Credit Card,6,56.99049928486348,20220101,94545,California,Hayward,1265 Xavier Avenue
trx-812311767786,85525,11,8,899.99,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street
trx-714156427993,85519,5,4,19.99,0.0,PayPal,6,79.96,20220101,6040,Connecticut,Manchester,52 Linnmore Drive
trx-926752978689,85553,15,10,29.99,0.0,PayPal,6,299.9,20220101,5461,Vermont,Hinesburg,1313 North Road
trx-377988149376,85548,30,6,24.99,0.0,Stripe,6,149.94,20220101,35242,Alabama,Birmingham,3328 Sunny Meadow...


In [79]:
product_dim_df=all_groups_merged_df.select(
    col("product_id"),
    col("product_name"),
    col("product_category"),
    col("unit_price")).distinct()

product_dim_df.show()

+----------+-----------------+----------------+----------+
|product_id|     product_name|product_category|unit_price|
+----------+-----------------+----------------+----------+
|        29|Hair Straightener|      Appliances|     39.99|
|        19|          Sandals|        Footwear|     29.99|
|        12|          Monitor|     Electronics|    299.99|
|        17|           Blouse|        Clothing|     29.99|
|        18|            Boots|        Footwear|    149.99|
|        15|           Hoodie|        Clothing|     29.99|
|        10|          Sandals|        Footwear|     39.99|
|         1|           Laptop|     Electronics|    999.99|
|         6|            Jeans|        Clothing|     49.99|
|        16|            Skirt|        Clothing|     39.99|
|         7|            Dress|        Clothing|     59.99|
|        28|       Hair Dryer|      Appliances|     19.99|
|         3|           Tablet|     Electronics|    299.99|
|        14|           Camera|     Electronics|    399.9

In [80]:
# Define a Window specification
window_spec = Window.orderBy("product_id")

# Add a product_key column using row_number
product_dim_df = product_dim_df.withColumn("product_key", row_number().over(window_spec))

product_dim_df.coalesce(1)

product_id,product_name,product_category,unit_price,product_key
1,Laptop,Electronics,999.99,1
2,Smartphone,Electronics,699.99,2
3,Tablet,Electronics,299.99,3
4,Headphones,Electronics,99.99,4
5,T-Shirt,Clothing,19.99,5
6,Jeans,Clothing,49.99,6
7,Dress,Clothing,59.99,7
8,Sneakers,Footwear,79.99,8
9,Boots,Footwear,129.99,9
10,Sandals,Footwear,39.99,10


In [81]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/products_dim.csv"
product_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [82]:
# Perform an inner join on transaction_date and date
online_fact_df = online_fact_df.join(product_dim_df, online_fact_df["product_id"] == product_dim_df["product_id"], "inner")

In [83]:
online_fact_df.coalesce(1)

transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_id.1,product_name,product_category,unit_price.1,product_key
trx-549713340293,85522,26,8,199.99,0.0,Stripe,6,1599.92,20220101,21114,Maryland,Crofton,1630 Eton Way,26,Vacuum Cleaner,Appliances,199.99,26
trx-120912375499,85483,26,4,199.99,0.0,Credit Card,6,799.96,20220102,1902,Massachusetts,Lynn,172 Chestnut Street,26,Vacuum Cleaner,Appliances,199.99,26
trx-500133384271,85504,26,3,199.99,0.0,PayPal,6,599.97,20220102,1719,Massachusetts,Boxborough,302 Depot Road,26,Vacuum Cleaner,Appliances,199.99,26
trx-741895782895,85550,26,4,199.99,0.0,Credit Card,6,799.96,20220102,5075,Vermont,Thetford,398 Gove Hill Road,26,Vacuum Cleaner,Appliances,199.99,26
trx-830547671067,85464,26,5,199.99,0.0,Stripe,6,999.95,20220103,99611,Alaska,Kenai,51185 Helmsman St...,26,Vacuum Cleaner,Appliances,199.99,26
trx-087732089100,85534,26,5,199.99,0.15,PayPal,6,849.9575238406659,20220103,21122,Maryland,Pasadena,202 Winston Road,26,Vacuum Cleaner,Appliances,199.99,26
trx-227505126526,85504,26,10,199.99,0.15,Stripe,6,1699.9150476813318,20220103,6040,Connecticut,Manchester,89 Frances Drive,26,Vacuum Cleaner,Appliances,199.99,26
trx-512595649334,85561,26,8,199.99,0.0,Credit Card,6,1599.92,20220103,85304,Arizona,Glendale,13066 North 56th ...,26,Vacuum Cleaner,Appliances,199.99,26
trx-950565917451,85512,26,5,199.99,0.0,Stripe,6,999.95,20220103,21060,Maryland,Glen Burnie,140 William Chamb...,26,Vacuum Cleaner,Appliances,199.99,26
trx-361642366546,85526,26,5,199.99,0.2,Credit Card,6,799.9600119203329,20220104,36116,Alabama,Montgomery,4560 Hurlston Drive,26,Vacuum Cleaner,Appliances,199.99,26


In [84]:
online_fact_df=online_fact_df.drop("product_id", "product_name", "product_category", "unit_price")
online_fact_df.coalesce(1)

transaction_id,customer_id,units,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_key
trx-038966531476,85511,9,0.0,PayPal,2,8099.91,20220101,5446,Vermont,Colchester,1783 Blakely Road,11
trx-196650129641,85502,1,0.0,Stripe,2,79.99,20220101,2149,Massachusetts,Everett,88 Shute Street,8
trx-961186738199,85540,8,0.2,PayPal,3,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9
trx-961186738199,85540,8,0.2,PayPal,1,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9
trx-807433736285,85505,9,0.2,Stripe,4,359.9280053633452,20220101,5079,Vermont,Vershire,6460 Vermont 113,24
trx-378573229506,85552,1,0.05,Credit Card,6,56.99049928486348,20220101,94545,California,Hayward,1265 Xavier Avenue,7
trx-812311767786,85525,8,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street,11
trx-714156427993,85519,4,0.0,PayPal,6,79.96,20220101,6040,Connecticut,Manchester,52 Linnmore Drive,5
trx-926752978689,85553,10,0.0,PayPal,6,299.9,20220101,5461,Vermont,Hinesburg,1313 North Road,15
trx-377988149376,85548,6,0.0,Stripe,6,149.94,20220101,35242,Alabama,Birmingham,3328 Sunny Meadow...,30


In [85]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(product_dim_df, offline_fact_df["product_id"] == product_dim_df["product_id"], "inner")


In [86]:
offline_fact_df=offline_fact_df.drop("product_id", "product_name", "product_category", "unit_price")
offline_fact_df.coalesce(1)

transaction_id,customer_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key
trx-407964138068,85479,4.0,5.0,8,0.1,Cash,3599.927904634476,20221005,25
trx-843694845999,85506,11.0,1.0,10,0.05,Credit Card,284.9049964249134,20221005,15
trx-548311587674,85541,5.0,3.0,10,0.05,Cash,284.9049964249134,20221005,19
trx-611821275884,85470,7.0,5.0,1,0.0,Credit Card,499.99,20221005,25
trx-527486306638,85544,10.0,2.0,5,0.1,Cash,179.95499523282052,20221005,23
trx-012682136132,85480,1.0,4.0,4,0.1,Cash,71.96399809360504,20221005,5
trx-167057015608,85519,8.0,2.0,10,0.0,Cash,599.9,20221005,7
trx-655869782401,85527,8.0,2.0,10,0.2,Cash,159.9200023829937,20221005,28
trx-725477728975,85462,4.0,6.0,6,0.0,Cash,899.94,20221005,18
trx-079756546639,85531,8.0,2.0,5,0.0,Credit Card,199.95,20221005,16


In [87]:
customer_dim_df=all_groups_merged_df.select(
    col("customer_id"),
    col("customer_name"),
    col("cleaned_email")).dropDuplicates(["customer_id"])

customer_dim_df.show()

+-----------+------------------+--------------------+
|customer_id|     customer_name|       cleaned_email|
+-----------+------------------+--------------------+
|      85525|         Ava Jones| ava.jones@yahoo.com|
|      85527|       Emma Miller|emma.miller@outlo...|
|      85520|  Alexander Wilson|alexander.wilson@...|
|      85541|         Ava Jones|ava.jones@hotmail...|
|      85473|     Sophia Miller|sophia.miller@yah...|
|      85514|     Michael Smith|michael.smith@out...|
|      85547|      James Wilson|james.wilson@outl...|
|      85508|      James Miller|james.miller@hotm...|
|      85542|      James Miller|james.miller@yaho...|
|      85506|      Mia Williams|mia.williams@hotm...|
|      85515|    William Wilson|william.wilson@ou...|
|      85511|   Alexander Moore|alexander.moore@y...|
|      85477|Alexander Williams|alexander.william...|
|      85554|        John Brown|john.brown@gmail.com|
|      85490|        Mia Taylor|mia.taylor@yahoo.com|
|      85516|  Michael Willi

In [88]:
# Define a Window specification
window_spec = Window.orderBy("customer_id")

# Add a product_key column using row_number
customer_dim_df = customer_dim_df.withColumn("customer_key", row_number().over(window_spec))

customer_dim_df.coalesce(1)

customer_id,customer_name,cleaned_email,customer_key
85462,Olivia Brown,olivia.brown@yaho...,1
85463,Ava Miller,ava.miller@gmail.com,2
85464,Alexander Moore,alexander.moore@o...,3
85465,James Taylor,james.taylor@gmai...,4
85466,Michael Brown,michael.brown@yah...,5
85467,Alexander Jones,alexander.jones@o...,6
85468,William Davis,william.davis@yah...,7
85469,Emma Miller,emma.miller@outlo...,8
85470,William Williams,william.williams@...,9
85471,Ava Williams,ava.williams@outl...,10


In [89]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/customer_dim.csv"
customer_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [90]:
# Perform an inner join on transaction_date and date
online_fact_df = online_fact_df.join(customer_dim_df, online_fact_df["customer_id"] == customer_dim_df["customer_id"], "inner")
online_fact_df.coalesce(1)

transaction_id,customer_id,units,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_key,customer_id.1,customer_name,cleaned_email,customer_key
trx-812311767786,85525,8,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street,11,85525,Michael Wilson,michael.wilson@gm...,64
trx-777696509076,85525,10,0.15,PayPal,5,509.9150143027305,20220103,6040,Connecticut,Manchester,81 Lyness Street,7,85525,Michael Wilson,michael.wilson@gm...,64
trx-961752173520,85525,5,0.15,PayPal,6,254.9575071513653,20220103,6040,Connecticut,Manchester,117 Adelaide Road,7,85525,Michael Wilson,michael.wilson@gm...,64
trx-752459780179,85525,9,0.05,Credit Card,6,1111.4144860535862,20220103,85310,Arizona,Glendale,23124 North 71st ...,21,85525,Michael Wilson,michael.wilson@gm...,64
trx-895691945634,85525,5,0.2,Credit Card,6,159.96000238358977,20220104,5905,Vermont,Maidstone,3541 North Road,23,85525,Michael Wilson,michael.wilson@gm...,64
trx-985459269846,85525,8,0.05,Credit Card,6,1139.923985695839,20220106,2143,Massachusetts,Somerville,13 Warren Avenue,18,85525,Michael Wilson,michael.wilson@gm...,64
trx-667598624256,85525,10,0.0,Stripe,6,1299.9,20220106,37209,Tennessee,Nashville,410 51st Avenue N...,21,85525,Michael Wilson,michael.wilson@gm...,64
trx-566935476646,85525,9,0.1,Credit Card,6,404.9189892733097,20220106,6040,Connecticut,Manchester,46 Lilac Street,24,85525,Michael Wilson,michael.wilson@gm...,64
trx-574027283390,85525,1,0.15,PayPal,6,84.99150238394736,20220107,20001,District of Columbia,Washington,81 Seaton Place N...,4,85525,Michael Wilson,michael.wilson@gm...,64
trx-037078085183,85525,8,0.0,PayPal,6,1199.92,20220107,80003,Colorado,Arvada,7912 Depew Street,13,85525,Michael Wilson,michael.wilson@gm...,64


In [91]:
online_fact_df=online_fact_df.drop("customer_id","cleaned_email","customer_name")
online_fact_df.coalesce(1)

transaction_id,units,discount,payment_method,group,total_price,date_key,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,product_key,customer_key
trx-038966531476,9,0.0,PayPal,2,8099.91,20220101,5446,Vermont,Colchester,1783 Blakely Road,11,50
trx-196650129641,1,0.0,Stripe,2,79.99,20220101,2149,Massachusetts,Everett,88 Shute Street,8,41
trx-961186738199,8,0.2,PayPal,3,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9,79
trx-961186738199,8,0.2,PayPal,1,831.9360123968125,20220101,5478,Vermont,Saint Albans City,45 Parsons Avenue,9,79
trx-807433736285,9,0.2,Stripe,4,359.9280053633452,20220101,5079,Vermont,Vershire,6460 Vermont 113,24,44
trx-378573229506,1,0.05,Credit Card,6,56.99049928486348,20220101,94545,California,Hayward,1265 Xavier Avenue,7,91
trx-812311767786,8,0.2,Stripe,6,5759.936085829735,20220101,5301,Vermont,West Brattleboro,129 Glen Street,11,64
trx-714156427993,4,0.0,PayPal,6,79.96,20220101,6040,Connecticut,Manchester,52 Linnmore Drive,5,58
trx-926752978689,10,0.0,PayPal,6,299.9,20220101,5461,Vermont,Hinesburg,1313 North Road,15,92
trx-377988149376,6,0.0,Stripe,6,149.94,20220101,35242,Alabama,Birmingham,3328 Sunny Meadow...,30,87


In [92]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(customer_dim_df, offline_fact_df["customer_id"] == customer_dim_df["customer_id"], "inner")
offline_fact_df.coalesce(1)

transaction_id,customer_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key,customer_id.1,customer_name,cleaned_email,customer_key
trx-170004611332,85525,7.0,5.0,10,0.05,Cash,237.40499702095985,20220527,30,85525,Michael Wilson,michael.wilson@gm...,64
trx-844036209605,85525,10.0,4.0,7,0.0,Credit Card,1049.93,20220527,18,85525,Michael Wilson,michael.wilson@gm...,64
trx-109416203740,85525,11.0,6.0,1,0.0,Credit Card,39.99,20220527,23,85525,Michael Wilson,michael.wilson@gm...,64
trx-106504733120,85525,10.0,6.0,7,0.0,Credit Card,4899.93,20220527,2,85525,Michael Wilson,michael.wilson@gm...,64
trx-745057689089,85525,9.0,6.0,5,0.2,Credit Card,1199.9600178807975,20220527,12,85525,Michael Wilson,michael.wilson@gm...,64
trx-714247080040,85525,7.0,4.0,8,0.0,Cash,639.92,20220718,8,85525,Michael Wilson,michael.wilson@gm...,64
trx-115302227341,85525,9.0,6.0,8,0.0,Cash,7999.92,20220718,1,85525,Michael Wilson,michael.wilson@gm...,64
trx-991057034635,85525,5.0,4.0,7,0.1,Cash,503.9369866502285,20220718,8,85525,Michael Wilson,michael.wilson@gm...,64
trx-815328334349,85525,10.0,5.0,7,0.0,Credit Card,1049.93,20220718,18,85525,Michael Wilson,michael.wilson@gm...,64
trx-197754917684,85525,1.0,3.0,9,0.0,Credit Card,359.91,20220718,10,85525,Michael Wilson,michael.wilson@gm...,64


In [93]:
offline_fact_df=offline_fact_df.drop( "customer_id")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key
trx-823890000599,8.0,6.0,2,0.0,Credit Card,1999.98,20220312,1,Michael Wilson,michael.wilson@gm...,64
trx-398777947283,9.0,4.0,6,0.0,Credit Card,179.94,20220312,27,Michael Wilson,michael.wilson@gm...,64
trx-989278595733,9.0,6.0,6,0.0,Cash,779.94,20220312,21,Michael Wilson,michael.wilson@gm...,64
trx-273180803756,3.0,1.0,1,0.2,Credit Card,63.99200095355511,20220312,22,Michael Wilson,michael.wilson@gm...,64
trx-717300843037,5.0,2.0,1,0.2,Credit Card,799.9920119208098,20220312,1,Michael Wilson,michael.wilson@gm...,64
trx-864607890708,8.0,2.0,10,0.2,Credit Card,7199.920107287168,20220325,11,Michael Wilson,michael.wilson@gm...,64
trx-834287510675,2.0,5.0,6,0.0,Credit Card,359.94,20220325,7,Michael Wilson,michael.wilson@gm...,64
trx-395878299405,4.0,3.0,6,0.0,Cash,779.94,20220325,9,Michael Wilson,michael.wilson@gm...,64
trx-505177293904,11.0,3.0,8,0.0,Credit Card,2399.92,20220325,3,Michael Wilson,michael.wilson@gm...,64
trx-249842381834,7.0,5.0,10,0.0,Credit Card,499.9,20220325,6,Michael Wilson,michael.wilson@gm...,64


In [63]:
offline_df.cache()

KeyboardInterrupt: 

In [60]:
#store branch dim

branches_dim_df=offline_df.select(
    col("branch_id"),
    col("branch_location"),
    col("branch_establish_date"),
    col("branch_class")).distinct()

branches_dim_df.show()

KeyboardInterrupt: 

In [95]:
# Define a Window specification
window_spec = Window.orderBy("branch_id")

# Add a product_key column using row_number
branches_dim_df = branches_dim_df.withColumn("branch_key", row_number().over(window_spec))

branches_dim_df.coalesce(1)

branch_id,branch_location,branch_establish_date,branch_class,branch_key
1.0,New York,2017-01-15,A,1
2.0,Los Angeles,2016-07-28,B,2
3.0,Chicago,2015-03-10,A,3
4.0,Houston,2016-11-05,D,4
5.0,Phoenix,2017-09-20,C,5
6.0,Oklahoma,2016-09-20,A,6


In [96]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/branches_dim.csv"
branches_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [97]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(branches_dim_df, offline_fact_df["branch_id"] == branches_dim_df["branch_id"], "inner")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,branch_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_id.1,branch_location,branch_establish_date,branch_class,branch_key
trx-752036065952,5.0,1.0,2,0.0,Credit Card,199.98,20220412,4,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-966555221163,4.0,1.0,1,0.0,Credit Card,149.99,20221008,13,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-438034122453,8.0,1.0,9,0.15,Credit Card,382.4235107266903,20221008,24,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-364183758782,3.0,1.0,3,0.15,Credit Card,76.47450214505196,20221201,15,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-861589695084,8.0,1.0,8,0.2,Credit Card,3199.9360476827624,20220718,25,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-723526461132,8.0,1.0,8,0.0,Credit Card,159.92,20220402,28,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-128824540336,3.0,1.0,6,0.0,Credit Card,599.9399999999999,20220402,4,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-128824540336,3.0,1.0,6,0.0,Credit Card,599.9399999999999,20220402,4,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-389151782527,6.0,1.0,9,0.05,Cash,1709.914478543401,20220402,26,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1
trx-921810114081,5.0,1.0,6,0.0,Credit Card,179.94,20220402,27,Michael Wilson,michael.wilson@gm...,64,1.0,New York,2017-01-15,A,1


In [98]:
offline_fact_df=offline_fact_df.drop("branch_class", "branch_establish_date", "branch_location","branch_id")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_key
trx-288544269912,2.0,4,0.2,Credit Card,2879.9680429148675,20221005,11,Michael Wilson,michael.wilson@gm...,64,2
trx-453952256916,5.0,2,0.0,Cash,79.98,20221005,10,Michael Wilson,michael.wilson@gm...,64,6
trx-320018303559,7.0,10,0.2,Cash,319.92000476717953,20221005,10,Michael Wilson,michael.wilson@gm...,64,4
trx-024932783521,1.0,6,0.0,Cash,2999.94,20221005,25,Michael Wilson,michael.wilson@gm...,64,3
trx-209702051200,11.0,5,0.0,Cash,1499.95,20221005,12,Michael Wilson,michael.wilson@gm...,64,1
trx-267818677599,3.0,5,0.0,Credit Card,399.95,20221005,22,Michael Wilson,michael.wilson@gm...,64,2
trx-956064519174,11.0,5,0.0,Cash,199.95,20221005,29,Michael Wilson,michael.wilson@gm...,64,3
trx-480863914129,11.0,2,0.0,Credit Card,79.98,20221005,29,Michael Wilson,michael.wilson@gm...,64,1
trx-112384316514,2.0,6,0.1,Credit Card,215.94599427938465,20230121,10,Michael Wilson,michael.wilson@gm...,64,2
trx-766570299689,5.0,6,0.05,Cash,227.9429971396923,20230121,16,Michael Wilson,michael.wilson@gm...,64,3


In [99]:
sales_agent_dim_df=offline_merged_df.select(
    col("sales_agent_id"),
    col("sales_agent_hire_date"),
    col("sales_agent_name")).dropDuplicates(["sales_agent_id"])

sales_agent_dim_df.show()

+--------------+---------------------+------------------+
|sales_agent_id|sales_agent_hire_date|  sales_agent_name|
+--------------+---------------------+------------------+
|           1.0|           2020-06-10|          John Doe|
|           9.0|           2019-07-19|   Daniel Martinez|
|          10.0|           2019-11-10|      Sophia Moore|
|           5.0|           2020-06-23|      David Wilson|
|           6.0|           2018-08-09|       Emma Taylor|
|           4.0|           2018-11-12|       Emily Brown|
|           7.0|           2018-07-05|Christopher Miller|
|          11.0|           2018-07-10|         john wick|
|           2.0|           2021-06-08|        Jane Smith|
|           8.0|           2019-12-08|      Olivia Davis|
|           3.0|           2019-07-22|   Michael Johnson|
+--------------+---------------------+------------------+



In [100]:
# Define a Window specification
window_spec = Window.orderBy("sales_agent_id")

# Add a product_key column using row_number
sales_agent_dim_df = sales_agent_dim_df.withColumn("sales_agent_key", row_number().over(window_spec))

sales_agent_dim_df.coalesce(1)

sales_agent_id,sales_agent_hire_date,sales_agent_name,sales_agent_key
1.0,2020-06-10,John Doe,1
10.0,2019-11-10,Sophia Moore,2
11.0,2018-07-10,john wick,3
2.0,2021-06-08,Jane Smith,4
3.0,2019-07-22,Michael Johnson,5
4.0,2018-11-12,Emily Brown,6
5.0,2020-06-23,David Wilson,7
6.0,2018-08-09,Emma Taylor,8
7.0,2018-07-05,Christopher Miller,9
8.0,2019-12-08,Olivia Davis,10


In [101]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/sales_agent_dim.csv"
sales_agent_dim_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [102]:
# Perform an inner join on transaction_date and date
offline_fact_df = offline_fact_df.join(sales_agent_dim_df, offline_fact_df["sales_agent_id"] == sales_agent_dim_df["sales_agent_id"], "inner")
offline_fact_df.coalesce(1)

transaction_id,sales_agent_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_key,sales_agent_id.1,sales_agent_hire_date,sales_agent_name,sales_agent_key
trx-448099421943,1.0,5,0.15,Credit Card,339.9575095355511,20230417,22,Olivia Taylor,olivia.taylor@out...,59,2,1.0,2020-06-10,John Doe,1
trx-406663335952,1.0,4,0.0,Credit Card,519.96,20230421,9,Olivia Taylor,olivia.taylor@out...,59,5,1.0,2020-06-10,John Doe,1
trx-813287633702,1.0,4,0.0,Cash,799.96,20230428,26,Olivia Taylor,olivia.taylor@out...,59,1,1.0,2020-06-10,John Doe,1
trx-813287633702,1.0,4,0.0,Cash,799.96,20230428,26,Olivia Taylor,olivia.taylor@out...,59,1,1.0,2020-06-10,John Doe,1
trx-228542307923,1.0,7,0.0,Credit Card,209.93,20230607,15,Olivia Taylor,olivia.taylor@out...,59,5,1.0,2020-06-10,John Doe,1
trx-903716220298,1.0,4,0.2,Credit Card,415.9680061984063,20230310,9,Olivia Taylor,olivia.taylor@out...,59,1,1.0,2020-06-10,John Doe,1
trx-440880859229,1.0,7,0.0,Credit Card,279.93,20230721,16,Olivia Taylor,olivia.taylor@out...,59,2,1.0,2020-06-10,John Doe,1
trx-461132132500,1.0,1,0.05,Credit Card,47.49049940407277,20220321,24,Olivia Taylor,olivia.taylor@out...,59,4,1.0,2020-06-10,John Doe,1
trx-037068981368,1.0,3,0.0,Cash,2999.9700000000003,20220321,1,Olivia Taylor,olivia.taylor@out...,59,3,1.0,2020-06-10,John Doe,1
trx-382553805112,1.0,4,0.05,Cash,303.9619961857795,20220723,8,Olivia Taylor,olivia.taylor@out...,59,5,1.0,2020-06-10,John Doe,1


In [103]:
offline_fact_df=offline_fact_df.drop("sales_agent_id","sales_agent_name","sales_agent_hire_date")
offline_fact_df.coalesce(1)

transaction_id,units,discount,payment_method,total_price,date_key,product_key,customer_name,cleaned_email,customer_key,branch_key,sales_agent_key
trx-444160413883,3,0.15,Cash,50.97450142979622,20221007,28,Ava Jones,ava.jones@yahoo.com,64,4,11
trx-165719496429,8,0.0,Credit Card,799.92,20221007,4,Ava Jones,ava.jones@yahoo.com,64,6,2
trx-214614885507,3,0.0,Credit Card,389.97,20221007,9,Ava Jones,ava.jones@yahoo.com,64,3,8
trx-207212358314,7,0.0,Credit Card,1049.93,20221007,13,Ava Jones,ava.jones@yahoo.com,64,5,7
trx-792804914907,4,0.2,Cash,639.9680095362663,20221007,26,Ava Jones,ava.jones@yahoo.com,64,2,7
trx-272573559115,3,0.1,Cash,269.97299284815784,20221007,4,Ava Jones,ava.jones@yahoo.com,64,1,8
trx-998492206833,9,0.0,Cash,269.91,20221007,19,Ava Jones,ava.jones@yahoo.com,64,4,10
trx-459173050315,6,0.0,Cash,1799.94,20221007,12,Ava Jones,ava.jones@yahoo.com,64,2,6
trx-375078371845,10,0.0,Credit Card,399.9,20221007,16,Ava Jones,ava.jones@yahoo.com,64,5,7
trx-836483831918,6,0.0,Credit Card,899.94,20230417,13,Ava Jones,ava.jones@yahoo.com,64,6,11


In [104]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/offline_transactions_fact.csv"
offline_fact_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [105]:
# Write the merged DataFrame to a single CSV file
output_path = "/Graduation_Project/transformed_data/schema/online_transactions_fact.csv"
online_fact_df.coalesce(1).write.mode('overwrite').csv(output_path, header=True)

In [106]:
online_fact_df.count()

52500

In [107]:
offline_fact_df.count()

505000

In [59]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, date_format, row_number
from pyspark.sql.window import Window
from functools import reduce
from typing import List, Tuple
from utils import Config
import os

def read_parquet(spark: SparkSession, path: str) -> DataFrame:
    return spark.read.parquet(path)

def write_parquet(df: DataFrame, path: str) -> None:
    df.write.mode('overwrite').parquet(path)

def add_surrogate_key(df: DataFrame, key_name: str, order_by: str) -> DataFrame:
    window_spec = Window.orderBy(order_by)
    return df.withColumn(key_name, row_number().over(window_spec))

def process_date_dimension(spark: SparkSession) -> DataFrame:
       return read_parquet(spark, "/user/itversity/q-company_conformed_layer/normalized_model/date_dim/date_dim_table")

def process_product_dimension(df: DataFrame) -> DataFrame:
    product_dim = df.select("product_id", "product_name", "product_category", "unit_price").distinct()
    return add_surrogate_key(product_dim, "product_key", "product_id")

def process_customer_dimension(df: DataFrame) -> DataFrame:
    customer_dim = df.select("customer_id", "customer_name", "customer_email").distinct()
    return add_surrogate_key(customer_dim, "customer_key", "customer_id")

def process_branch_dimension(df: DataFrame) -> DataFrame:
    branch_dim = df.select("branch_id", "branch_location", "branch_establish_date", "branch_class").distinct()
    return add_surrogate_key(branch_dim, "branch_key", "branch_id")

def process_sales_agent_dimension(df: DataFrame) -> DataFrame:
    sales_agent_dim = df.select("sales_agent_id", "sales_agent_hire_date", "sales_agent_name").distinct()
    return add_surrogate_key(sales_agent_dim, "sales_agent_key", "sales_agent_id")

def process_online_fact(online_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame) -> DataFrame:
    online_fact = online_df.select(
        "transaction_id", "customer_id", "product_id", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    # Join with dimension tables
    online_fact = online_fact.join(date_df.select('date_key', 'date'), online_fact["transaction_date"] == date_df["date"], "inner") \
                             .join(product_df.select('product_key', 'product_id'), "product_id") \
                             .join(customer_df.select('customer_key', 'customer_id'), "customer_id")
        
    # Select final columns
    return online_fact.select(
        "transaction_id", "customer_key", "product_key", "date_key", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name"
    )

def process_offline_fact(offline_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame, branch_df: DataFrame, sales_agent_df: DataFrame) -> DataFrame:
    offline_fact = offline_df.select(
        "transaction_id", "customer_id", "sales_agent_id", "branch_id", "product_id",
        "units", "unit_price", "discount", "payment_method", "total_price",
        date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    # Join with dimension tables
    offline_fact = offline_fact.join(date_df.select('date_key', 'date'), offline_fact["transaction_date"] == date_df["date"], "inner") \
                               .join(product_df.select('product_key', 'product_id'), "product_id") \
                               .join(customer_df.select('customer_key', 'customer_id'), "customer_id") \
                               .join(branch_df.select('branch_key', 'branch_id'), "branch_id") \
                               .join(sales_agent_df.select('sales_agent_key', 'sales_agent_id'), "sales_agent_id")
    
    # Select final columns
    return offline_fact.select(
        "transaction_id", "customer_key", "sales_agent_key", "branch_key", "product_key", "date_key",
        "units", "unit_price", "discount", "payment_method", "total_price", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )

def process_dimensions(all_df: DataFrame, offline_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    date_df = process_date_dimension(spark)
    product_df = process_product_dimension(all_df)
    customer_df = process_customer_dimension(all_df)
    branch_df = process_branch_dimension(offline_df)
    sales_agent_df = process_sales_agent_dimension(offline_df)
    
    return date_df, product_df, customer_df, branch_df, sales_agent_df

def save_dimensions(dimensions: List[Tuple[str, DataFrame]]) -> None:
    for name, df in dimensions:
        write_parquet(df, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/{name}/{name}")

def main():
    spark = create_spark_session()
    
    try:
        # Read denormalized data
        online_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/online_fact_table/online_merged")
        offline_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/offline_fact_table/offline_merged")
        all_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/all_sales_fact_table/sales_merged")
        
        # Process dimensions
        date_df, product_df, customer_df, branch_df, sales_agent_df = process_dimensions(all_df, offline_df)
        
        # Save dimensions
        dimensions = [
            ("product_dim", product_df),
            ("customer_dim", customer_df),
            ("branch_dim", branch_df),
            ("sales_agent_dim", sales_agent_df)
        ]
        save_dimensions(dimensions)
        
        # Process and save fact tables
        online_fact = process_online_fact(online_df, date_df, product_df, customer_df)
        offline_fact = process_offline_fact(offline_df, date_df, product_df, customer_df, branch_df, sales_agent_df)
        
        write_parquet(online_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/online_sales_fact/online_fact")
        write_parquet(offline_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/offline_sales_fact/offline_fact")
        
        print("Normalized model processing completed successfully.")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
    main()



Normalized model processing completed successfully.


In [49]:
spark.stop()

In [56]:
def read_parquet(spark: SparkSession, path: str) -> DataFrame:
    return spark.read.parquet(path)

def write_parquet(df: DataFrame, path: str) -> None:
    df.write.mode('overwrite').parquet(path)

def add_surrogate_key(df: DataFrame, key_name: str, order_by: str) -> DataFrame:
    window_spec = Window.orderBy(order_by)
    return df.withColumn(key_name, row_number().over(window_spec))

def process_date_dimension(spark: SparkSession) -> DataFrame:
       return read_parquet(spark, "/user/itversity/q-company_conformed_layer/normalized_model/date_dim/date_dim_table")

def process_product_dimension(df: DataFrame) -> DataFrame:
    product_dim = df.select("product_id", "product_name", "product_category", "unit_price").distinct()
    return add_surrogate_key(product_dim, "product_key", "product_id")

def process_customer_dimension(df: DataFrame) -> DataFrame:
    customer_dim = df.select("customer_id", "customer_name", "customer_email").distinct()
    return add_surrogate_key(customer_dim, "customer_key", "customer_id")

def process_branch_dimension(df: DataFrame) -> DataFrame:
    branch_dim = df.select("branch_id", "branch_location", "branch_establish_date", "branch_class").distinct()
    return add_surrogate_key(branch_dim, "branch_key", "branch_id")

def process_sales_agent_dimension(df: DataFrame) -> DataFrame:
    sales_agent_dim = df.select("sales_agent_id", "sales_agent_hire_date", "sales_agent_name").distinct()
    return add_surrogate_key(sales_agent_dim, "sales_agent_key", "sales_agent_id")

def process_online_fact(online_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame) -> DataFrame:
    online_fact = online_df.select(
        "transaction_id", "customer_id", "product_id", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    # Join with dimension tables
    online_fact = online_fact.join(date_df.select('date_key', 'date'), online_fact["transaction_date"] == date_df["date"], "inner") \
                             .join(product_df.select('product_key', 'product_id'), "product_id") \
                             .join(customer_df.select('customer_key', 'customer_id'), "customer_id")
        
    # Select final columns
    return online_fact.select(
        "transaction_id", "customer_key", "product_key", "date_key", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name"
    )

def process_offline_fact(offline_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame, branch_df: DataFrame, sales_agent_df: DataFrame) -> DataFrame:
    offline_fact = offline_df.select(
        "transaction_id", "customer_id", "sales_agent_id", "branch_id", "product_id",
        "units", "unit_price", "discount", "payment_method", "total_price",
        date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    # Join with dimension tables
    offline_fact = offline_fact.join(date_df.select('date_key', 'date'), offline_fact["transaction_date"] == date_df["date"], "inner") \
                               .join(product_df.select('product_key', 'product_id'), "product_id") \
                               .join(customer_df.select('customer_key', 'customer_id'), "customer_id") \
                               .join(branch_df.select('branch_key', 'branch_id'), "branch_id") \
                               .join(sales_agent_df.select('sales_agent_key', 'sales_agent_id'), "sales_agent_id")
    
    # Select final columns
    return offline_fact.select(
        "transaction_id", "customer_key", "sales_agent_key", "branch_key", "product_key", "date_key",
        "units", "unit_price", "discount", "payment_method", "total_price", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )

def process_dimensions(all_df: DataFrame, offline_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    date_df = process_date_dimension(spark)
    product_df = process_product_dimension(all_df)
    customer_df = process_customer_dimension(all_df)
    branch_df = process_branch_dimension(offline_df)
    sales_agent_df = process_sales_agent_dimension(offline_df)
    
    return date_df, product_df, customer_df, branch_df, sales_agent_df

def save_dimensions(dimensions: List[Tuple[str, DataFrame]]) -> None:
    for name, df in dimensions:
        write_parquet(df, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/{name}/{name}")



In [19]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, date_format, row_number
from pyspark.sql.window import Window
from functools import reduce
from typing import List, Tuple
from utils import Config
import os

In [4]:

def create_spark_session() -> SparkSession:
    return SparkSession.builder \
        .appName("Normalized_Model") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()


In [6]:
spark = create_spark_session()
    

In [8]:
online_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/online_fact_table/online_merged")
offline_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/offline_fact_table/offline_merged")
all_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/all_sales_fact_table/sales_merged")

In [20]:
date_df = process_date_dimension(spark)

In [25]:
date_df, product_df, customer_df, branch_df, sales_agent_df = process_dimensions(all_df, offline_df)


In [26]:
branch_df

branch_id,branch_location,branch_establish_date,branch_class,branch_key
1,New York,2017-01-15,A,1
2,Los Angeles,2016-07-28,B,2
3,Chicago,2015-03-10,A,3
4,Houston,2016-11-05,D,4
5,Phoenix,2017-09-20,C,5


In [28]:
customer_df.count()

101

In [32]:
product_df

product_id,product_name,product_category,unit_price,product_key
1,Laptop,Electronics,999.99,1
2,Smartphone,Electronics,699.99,2
3,Tablet,Electronics,299.99,3
4,Headphones,Electronics,99.99,4
5,T-Shirt,Clothing,19.99,5
6,Jeans,Clothing,49.99,6
7,Dress,Clothing,59.99,7
8,Sneakers,Footwear,79.99,8
9,Boots,Footwear,129.99,9
10,Sandals,Footwear,39.99,10


In [36]:
dimensions = [
            ("product_dim", product_df),
            ("customer_dim", customer_df),
            ("branch_dim", branch_df),
            ("sales_agent_dim", sales_agent_df)
        ]
save_dimensions(dimensions)

In [57]:
online_fact = process_online_fact(online_df, date_df, product_df, customer_df)
offline_fact = process_offline_fact(offline_df, date_df, product_df, customer_df, branch_df, sales_agent_df)
        

In [48]:
online_fact = online_df.select(
        "transaction_id", "customer_id", "product_id", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    

In [40]:
online_fact

transaction_id,customer_id,product_id,units,unit_price,discount,payment_method,group,total_price,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,transaction_date
trx-566523295049,85504,2,6,699.99,0.25,Credit Card,group1,3149.955,80005,Colorado,Arvada,12039 West 85th D...,2022-02-24
trx-340311249068,85489,5,5,19.99,0.0,Stripe,group1,99.95,73135,Oklahoma,Oklahoma City,4725 Royal Oak Drive,2022-10-20
trx-132026662696,85546,6,7,49.99,0.2,Credit Card,group1,279.944,5408,Vermont,Burlington,185 Woodlawn Road,2023-07-12
trx-933247844102,85510,28,2,19.99,0.15,Stripe,group1,33.983,6040,Connecticut,Manchester,40 Highwood Drive,2023-07-12
trx-889317927482,85476,29,4,39.99,0.15,PayPal,group1,135.966,40206,Kentucky,Louisville,2721 Lindsay Avenue,2023-03-08
trx-912726017128,85560,9,5,129.99,0.0,Credit Card,group1,649.95,72701,Arkansas,Fayetteville,112 West Center S...,2022-06-09
trx-481243773278,85513,28,7,19.99,0.0,Credit Card,group1,139.93,1331,Massachusetts,Athol,7 Oliver Street,2023-04-14
trx-667583933070,85492,30,6,24.99,0.1,PayPal,group1,134.946,37013,Tennessee,Nashville,1124 Bluewillow C...,2023-04-14
trx-644749125609,85513,17,9,29.99,0.0,Credit Card,group1,269.91,6040,Connecticut,Manchester,86 Highland Street,2022-12-08
trx-269534893579,85498,25,9,499.99,0.1,PayPal,group1,4049.919,1721,Massachusetts,Ashland,9 Thomas Street,2022-07-21


In [50]:
online_fact

customer_id,product_id,transaction_id,units,unit_price,discount,payment_method,group,total_price,shipping_zip_code,shipping_state,shipping_city,shipping_street_name,transaction_date,date_key,date,product_key,customer_key
85504,2,trx-566523295049,6,699.99,0.25,Credit Card,group1,3149.955,80005,Colorado,Arvada,12039 West 85th D...,2022-02-24,20220224,2022-02-24,2,43
85489,5,trx-340311249068,5,19.99,0.0,Stripe,group1,99.95,73135,Oklahoma,Oklahoma City,4725 Royal Oak Drive,2022-10-20,20221020,2022-10-20,5,28
85546,6,trx-132026662696,7,49.99,0.2,Credit Card,group1,279.944,5408,Vermont,Burlington,185 Woodlawn Road,2023-07-12,20230712,2023-07-12,6,85
85510,28,trx-933247844102,2,19.99,0.15,Stripe,group1,33.983,6040,Connecticut,Manchester,40 Highwood Drive,2023-07-12,20230712,2023-07-12,28,49
85476,29,trx-889317927482,4,39.99,0.15,PayPal,group1,135.966,40206,Kentucky,Louisville,2721 Lindsay Avenue,2023-03-08,20230308,2023-03-08,29,15
85560,9,trx-912726017128,5,129.99,0.0,Credit Card,group1,649.95,72701,Arkansas,Fayetteville,112 West Center S...,2022-06-09,20220609,2022-06-09,9,99
85513,28,trx-481243773278,7,19.99,0.0,Credit Card,group1,139.93,1331,Massachusetts,Athol,7 Oliver Street,2023-04-14,20230414,2023-04-14,28,52
85492,30,trx-667583933070,6,24.99,0.1,PayPal,group1,134.946,37013,Tennessee,Nashville,1124 Bluewillow C...,2023-04-14,20230414,2023-04-14,30,31
85513,17,trx-644749125609,9,29.99,0.0,Credit Card,group1,269.91,6040,Connecticut,Manchester,86 Highland Street,2022-12-08,20221208,2022-12-08,17,52
85498,25,trx-269534893579,9,499.99,0.1,PayPal,group1,4049.919,1721,Massachusetts,Ashland,9 Thomas Street,2022-07-21,20220721,2022-07-21,25,37


In [53]:
offline_fact = offline_df.select(
        "transaction_id", "customer_id", "sales_agent_id", "branch_id", "product_id",
        "units", "unit_price", "discount", "payment_method", "total_price",
        date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    

In [54]:
offline_fact = offline_fact.join(date_df.select('date_key', 'date'), offline_fact["transaction_date"] == date_df["date"], "inner") \
                               .join(product_df.select('product_key', 'product_id'), "product_id") \
                               .join(customer_df.select('customer_key', 'customer_id'), "customer_id") \
                               .join(branch_df.select('branch_key', 'branch_id'), "branch_id") \
                               .join(sales_agent_df.select('sales_agent_key', 'sales_agent_id'), "sales_agent_id")
    

In [58]:
write_parquet(online_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/online_sales_fact/online_fact")
write_parquet(offline_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/offline_sales_fact/offline_fact")
