In [49]:
import os
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit, current_timestamp, col
from pyspark.sql.window import Window
from pyspark.sql.types import StructType
from utils import Config
from datetime import datetime, timedelta
from typing import Tuple

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    return  spark.read.parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    current_day = datetime.now().strftime("%Y-%m-%d")
    input_base_path = f"{Config.STANDARDIZED_BASE_PATH}/standardized_sales_transaction_{current_day}"
    
    online_df = read_parquet_with_schema(spark, f"{input_base_path}/online_transactions*")
    offline_df = read_parquet_with_schema(spark, f"{input_base_path}/offline_transactions*")
    
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))
    
    online_df_a, offline_df_a = align_schemas(offline_df_a, online_df_a)
    
    all_df = online_df_a.unionByName(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return online_df_a, offline_df_a, all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
        
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    
    print(file_path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
    else:
        merged_df = df
    
    # Write merged data
    merged_df.write.partitionBy("transaction_date") \
                   .option("schema", merged_df.schema.json()) \
                   .mode("append") \
                   .parquet(file_path)
    
    print(f"Merged and wrote {merged_df.count()} rows to {file_path}")

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table"
    else:
        return f"{denorm_path}/all_sales_fact_table"

# def main():
#     spark = SparkSession.builder \
#         .appName("DenormalizedModelProcessing") \
#         .config("spark.sql.adaptive.enabled", "true") \
#         .config("spark.sql.shuffle.partitions", "200") \
#         .getOrCreate()
    
#     try:
#         online_df, offline_df, all_df = process_denormalized_model(spark)
        
#         denorm_modeling(spark, online_df, 'online')
#         denorm_modeling(spark, offline_df, 'offline')
#         denorm_modeling(spark, all_df, 'all')
    
#     except Exception as e:
#         print(f"An error occurred: {str(e)}")
#     finally:
#         spark.stop()

# if __name__ == "__main__":
#     main()

In [None]:
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")


In [1]:
from pyspark.sql import SparkSession, DataFrame

In [13]:
from pyspark.sql import SparkSession, DataFrame

spark = SparkSession.builder \
        .appName("DenormalizedModelProcessing") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.shuffle.partitions", "200") \
        .getOrCreate()
    
    

In [2]:
df = spark.read.parquet("/user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged")

AnalysisException: 'Path does not exist: hdfs://localhost:9000/user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged;'

In [16]:
import os
from pyspark.sql.functions import lit
from utils import Config
from typing import Tuple
from datetime import datetime
from utils import *

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    return spark.read.schema(schema).parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    current_day = datetime.now().strftime("%Y-%m-%d")
    input_base_path = f"{Config.STANDARDIZED_BASE_PATH}/standardized_sales_transaction_{current_day}"
    
    online_df = read_parquet_with_schema(spark, "/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/online_transactions_group6_20240712192518")
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    
    offline_df = read_parquet_with_schema(spark, f"{input_base_path}/offline_transactions*")
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))
    
    return online_df_a, offline_df_a

def get_all_sales(online_df: DataFrame, offline_df: DataFrame) -> DataFrame:
    
    online_df_a, offline_df_a = align_schemas(online_df, offline_df)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
    else:
        merged_df = df
    
    print(merged_df.printSchema())
    merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
    print(f"Appended {df.count()} rows to {file_path}")

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table/offline_merged"
    else:
        return f"{denorm_path}/all_sales_fact_table/sales_merged"

def main():
    spark = SparkSession.builder.appName("DenormalizedModelProcessing").getOrCreate()
    
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

# if __name__ == "__main__":
#     main()

In [17]:
online_df, offline_df = process_denormalized_model(spark)


Py4JJavaError: An error occurred while calling o395.parquet.
: java.lang.AssertionError: assertion failed: Conflicting directory structures detected. Suspicious paths:
	hdfs://localhost:9000/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/offline_transactions_group6_20240712192719
	hdfs://localhost:9000/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/offline_transactions_group1_20240712185152

If provided paths are partition directories, please set "basePath" in the options of the data source to specify the root directory of the table. If there are multiple root directories, please load them separately and then union them.
	at scala.Predef$.assert(Predef.scala:170)
	at org.apache.spark.sql.execution.datasources.PartitioningUtils$.parsePartitions(PartitioningUtils.scala:156)
	at org.apache.spark.sql.execution.datasources.PartitioningUtils$.parsePartitions(PartitioningUtils.scala:100)
	at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.inferPartitioning(PartitioningAwareFileIndex.scala:131)
	at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.partitionSpec(InMemoryFileIndex.scala:71)
	at org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.partitionSchema(PartitioningAwareFileIndex.scala:50)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:158)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:387)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:242)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:230)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:667)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [21]:
online_df = spark.read.parquet("/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/online_transactions_group6_20240712192518")

In [22]:
online_df.count()

50000

In [25]:
online_df=online_df.withColumn("transaction_type", lit("online"))

In [5]:
all_sales = get_all_sales(online_df, offline_df)

In [41]:
online_df.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- total_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_street_name: string (nullable = true)
 |-- shipping_city: string (nullable = true)
 |-- shipping_state: string (nullable = true)
 |-- shipping_zip_code: string (nullable = true)
 |-- group: string (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- transaction_type: string (nullable = false)



In [26]:
denorm_modeling(spark, online_df, 'online')

/user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged
True
root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- total_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_street_name: string (nullable = true)
 |-- shipping_city: string (nullable = true)
 |-- shipping_state: string (nullable = true)
 |-- shipping_zip_code: string (nullable = true)
 |-- group: string (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- transaction_date: date (nullable = true)

None
Appended 50000 rows to /user/itversity/q-company_conf

In [12]:
spark.stop()

In [None]:
path_off_6 = "/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/offline_transactions_group6_20240712192719"
path_on_6 = "/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/online_transactions_group6_20240712192518"

In [27]:
df = spark.read.parquet("/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-12/online_transactions_group1_20240712185034")

In [28]:
df.count()

51000

In [29]:
spark.stop()

In [6]:
import os
from pyspark.sql.functions import lit
from utils import Config
from typing import Tuple
from datetime import datetime
from utils import *

def get_latest_file(spark, base_path, transaction_type):
    current_date = datetime.now().strftime("%Y-%m-%d")
    
    files = spark.sparkContext.wholeTextFiles(f"{base_path}/standardized_sales_transaction_{current_date}/{transaction_type}_transactions*").keys().collect()
    
    
    latest_file = sorted(files, key=lambda x: x.split('_')[-1], reverse=True)[0]
    
    return latest_file

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    return spark.read.schema(schema).parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    base_path = "/user/itversity/q-company_standardized_layer"
    
    # Get latest online and offline files
    online_file = get_latest_file(spark, base_path, "online", group_name)
    offline_file = get_latest_file(spark, base_path, "offline", group_name)
    
    if not online_file or not offline_file:
        raise ValueError(f"Could not find latest files for group {group_name}")
    
    online_df = read_parquet_with_schema(spark, online_file)
    offline_df = read_parquet_with_schema(spark, offline_file)
    
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))
    
    return online_df_a, offline_df_a

def get_all_sales(online_df: DataFrame, offline_df: DataFrame) -> DataFrame:
    
    online_df_a, offline_df_a = align_schemas(online_df, offline_df)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
    else:
        merged_df = df
    
    print(merged_df.printSchema())
    merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
    print(f"Appended {df.count()} rows to {file_path}")

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table/offline_merged"
    else:
        return f"{denorm_path}/all_sales_fact_table/sales_merged"

def main():
    spark = SparkSession.builder.appName("DenormalizedModelProcessing").getOrCreate()
    
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
     main()

/user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged
False
root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- total_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_street_name: string (nullable = true)
 |-- shipping_city: string (nullable = true)
 |-- shipping_state: string (nullable = true)
 |-- shipping_zip_code: string (nullable = true)
 |-- group: string (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- transaction_type: string (nullable = false)

None
Appended 500 rows to /user/itversity/q-company_conf

In [2]:
import os
from pyspark.sql.functions import lit
from utils import Config
from typing import Tuple
from datetime import datetime
from utils import *

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    return spark.read.schema(schema).parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    current_day = datetime.now().strftime("%Y-%m-%d")
    input_base_path = f"{Config.STANDARDIZED_BASE_PATH}/standardized_sales_transaction_{current_day}"
    
    offline_df = read_parquet_with_schema(spark, "/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/offline_transactions_group5_20240713022953")
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    
    online_df = read_parquet_with_schema(spark, "/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/online_transactions_group5_20240713022835")
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))
    
    return online_df_a, offline_df_a

def get_all_sales(online_df: DataFrame, offline_df: DataFrame) -> DataFrame:
    
    online_df_a, offline_df_a = align_schemas(online_df, offline_df)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
    else:
        merged_df = df
    
    print(merged_df.printSchema())
    merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
    print(f"Appended {df.count()} rows to {file_path}")

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table/offline_merged"
    else:
        return f"{denorm_path}/all_sales_fact_table/sales_merged"

In [3]:
    spark = SparkSession.builder.appName("DenormalizedModelProcessing").getOrCreate()
    
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()


An error occurred: local variable 'online_df' referenced before assignment


In [None]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, date_format, row_number
from pyspark.sql.window import Window
from functools import reduce
from typing import List, Tuple
from utils import Config
import os


def get_latest_file(spark, base_path, transaction_type):
    current_date = datetime.now().strftime("%Y-%m-%d")
    
    files = spark.sparkContext.wholeTextFiles(f"{base_path}/standardized_sales_transaction_{current_date}/{transaction_type}_transactions*").keys().collect()
    
    
    latest_file = sorted(files, key=lambda x: x.split('_')[-1], reverse=True)[0]
    
    return latest_file

def create_spark_session() -> SparkSession:
    return SparkSession.builder \
        .appName("Normalized_Model") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()

def read_parquet(spark: SparkSession, path: str) -> DataFrame:
    return spark.read.parquet(path)

def write_parquet(df: DataFrame, path: str) -> None:
    df.write.mode('append').parquet(path)

def add_surrogate_key(df: DataFrame, key_name: str, order_by: str) -> DataFrame:
    window_spec = Window.orderBy(order_by)
    return df.withColumn(key_name, row_number().over(window_spec))

def process_date_dimension(spark: SparkSession) -> DataFrame:
       return read_parquet(spark, "/user/itversity/q-company_conformed_layer/normalized_model/date_dim/date_dim_table")

def process_product_dimension(df: DataFrame) -> DataFrame:
    product_dim = df.select("product_id", "product_name", "product_category", "unit_price").distinct()
    return add_surrogate_key(product_dim, "product_key", "product_id")

def process_customer_dimension(df: DataFrame) -> DataFrame:
    customer_dim = df.select("customer_id", "customer_name", "customer_email").distinct()
    return add_surrogate_key(customer_dim, "customer_key", "customer_id")

def process_branch_dimension(df: DataFrame) -> DataFrame:
    branch_dim = df.select("branch_id", "branch_location", "branch_establish_date", "branch_class").distinct()
    return add_surrogate_key(branch_dim, "branch_key", "branch_id")

def process_sales_agent_dimension(df: DataFrame) -> DataFrame:
    sales_agent_dim = df.select("sales_agent_id", "sales_agent_hire_date", "sales_agent_name").distinct()
    return add_surrogate_key(sales_agent_dim, "sales_agent_key", "sales_agent_id")

def process_online_fact(online_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame) -> DataFrame:
    online_fact = online_df.select(
        "transaction_id", "customer_id", "product_id", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    # Join with dimension tables
    online_fact = online_fact.join(date_df.select('date_key', 'date'), online_fact["transaction_date"] == date_df["date"], "inner") \
                             .join(product_df.select('product_key', 'product_id'), "product_id") \
                             .join(customer_df.select('customer_key', 'customer_id'), "customer_id")
        
    # Select final columns
    return online_fact.select(
        "transaction_id", "customer_key", "product_key", "date_key", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name"
    )

def process_offline_fact(offline_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame, branch_df: DataFrame, sales_agent_df: DataFrame) -> DataFrame:
    offline_fact = offline_df.select(
        "transaction_id", "customer_id", "sales_agent_id", "branch_id", "product_id",
        "units", "unit_price", "discount", "payment_method", "total_price",
        date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    # Join with dimension tables
    offline_fact = offline_fact.join(date_df.select('date_key', 'date'), offline_fact["transaction_date"] == date_df["date"], "inner") \
                               .join(product_df.select('product_key', 'product_id'), "product_id") \
                               .join(customer_df.select('customer_key', 'customer_id'), "customer_id") \
                               .join(branch_df.select('branch_key', 'branch_id'), "branch_id") \
                               .join(sales_agent_df.select('sales_agent_key', 'sales_agent_id'), "sales_agent_id")
    
    # Select final columns
    return offline_fact.select(
        "transaction_id", "customer_key", "sales_agent_key", "branch_key", "product_key", "date_key",
        "units", "unit_price", "discount", "payment_method", "total_price", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )

def process_dimensions(all_df: DataFrame, offline_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    date_df = process_date_dimension(spark)
    product_df = process_product_dimension(all_df)
    customer_df = process_customer_dimension(all_df)
    branch_df = process_branch_dimension(offline_df)
    sales_agent_df = process_sales_agent_dimension(offline_df)
    
    return date_df, product_df, customer_df, branch_df, sales_agent_df

def save_dimensions(dimensions: List[Tuple[str, DataFrame]]) -> None:
    for name, df in dimensions:
        write_parquet(df, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/{name}/{name}")

def main():
    spark = create_spark_session()
    
    try:
        # Read denormalized data
        online_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/online_fact_table/online_merged")
        offline_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/offline_fact_table/offline_merged")
        all_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/all_sales_fact_table/sales_merged")
        
        # Process dimensions
        date_df, product_df, customer_df, branch_df, sales_agent_df = process_dimensions(all_df, offline_df)
        
        # Save dimensions
        dimensions = [
            ("product_dim", product_df),
            ("customer_dim", customer_df),
            ("branch_dim", branch_df),
            ("sales_agent_dim", sales_agent_df)
        ]
        save_dimensions(dimensions)
        
        # Process and save fact tables
        online_fact = process_online_fact(online_df, date_df, product_df, customer_df)
        offline_fact = process_offline_fact(offline_df, date_df, product_df, customer_df, branch_df, sales_agent_df)
        
        write_parquet(online_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/online_sales_fact/online_fact")
        write_parquet(offline_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/offline_sales_fact/offline_fact")
        
        print("Normalized model processing completed successfully.")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
    main()




In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, date_format, row_number
from pyspark.sql.window import Window
from functools import reduce
from typing import List, Tuple
from utils import Config
import os


def get_latest_file(spark, base_path, transaction_type):
    current_date = datetime.now().strftime("%Y-%m-%d")
    
    files = spark.sparkContext.wholeTextFiles(f"{base_path}/standardized_sales_transaction_{current_date}/{transaction_type}_transactions*").keys().collect()
    
    
    latest_file = sorted(files, key=lambda x: x.split('_')[-1], reverse=True)[0]
    
    return latest_file

def create_spark_session() -> SparkSession:
    return SparkSession.builder \
        .appName("Normalized_Model") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()

def read_parquet(spark: SparkSession, path: str) -> DataFrame:
    return spark.read.parquet(path)

def write_parquet(spark, df: DataFrame, path: str, on: str) -> None:
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    file_path = spark._jvm.org.apache.hadoop.fs.Path(path)
    merged_file_exists = fs.exists(file_path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(path)
        changes_records = df.join(existing_df, on=on, how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
    else:
        merged_df = df
    
    print(merged_df.printSchema())
    
    merged_df.write.mode('overwrite').parquet(path)
    
    print(f"Appended {df.count()} rows to {file_path}")

def add_surrogate_key(df: DataFrame, key_name: str, order_by: str) -> DataFrame:
    window_spec = Window.orderBy(order_by)
    return df.withColumn(key_name, row_number().over(window_spec))

def process_date_dimension(spark) -> DataFrame:
        return read_parquet(spark, "/user/itversity/q-company_conformed_layer/normalized_model/date_dim/date_dim_table")

def process_product_dimension(df: DataFrame) -> DataFrame:
    product_dim = df.select("product_id", "product_name", "product_category", "unit_price").distinct()
    return add_surrogate_key(product_dim, "product_key", "product_id")

def process_customer_dimension(df: DataFrame) -> DataFrame:
    customer_dim = df.select("customer_id", "customer_name", "customer_email").distinct()
    return add_surrogate_key(customer_dim, "customer_key", "customer_id")

def process_branch_dimension(df: DataFrame) -> DataFrame:
    branch_dim = df.select("branch_id", "branch_location", "branch_establish_date", "branch_class").distinct()
    return add_surrogate_key(branch_dim, "branch_key", "branch_id")

def process_sales_agent_dimension(df: DataFrame) -> DataFrame:
    sales_agent_dim = df.select("sales_agent_id", "sales_agent_hire_date", "sales_agent_name").distinct()
    return add_surrogate_key(sales_agent_dim, "sales_agent_key", "sales_agent_id")

def process_online_fact(online_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame) -> DataFrame:
    online_fact = online_df.select(
        "transaction_id", "customer_id", "product_id", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    online_fact = online_fact.join(date_df.select('date_key', 'date'), online_fact["transaction_date"] == date_df["date"], "inner") \
                             .join(product_df.select('product_key', 'product_id'), "product_id") \
                             .join(customer_df.select('customer_key', 'customer_id'), "customer_id")
        
    return online_fact.select(
        "transaction_id", "customer_key", "product_key", "date_key", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name"
    )

def process_offline_fact(offline_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame, branch_df: DataFrame, sales_agent_df: DataFrame) -> DataFrame:
    offline_fact = offline_df.select(
        "transaction_id", "customer_id", "sales_agent_id", "branch_id", "product_id",
        "units", "unit_price", "discount", "payment_method", "total_price",
        date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    offline_fact = offline_fact.join(date_df.select('date_key', 'date'), offline_fact["transaction_date"] == date_df["date"], "inner") \
                               .join(product_df.select('product_key', 'product_id'), "product_id") \
                               .join(customer_df.select('customer_key', 'customer_id'), "customer_id") \
                               .join(branch_df.select('branch_key', 'branch_id'), "branch_id") \
                               .join(sales_agent_df.select('sales_agent_key', 'sales_agent_id'), "sales_agent_id")
    
    return offline_fact.select(
        "transaction_id", "customer_key", "sales_agent_key", "branch_key", "product_key", "date_key",
        "units", "unit_price", "discount", "payment_method", "total_price", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )

def process_dimensions(spark: SparkSession, all_df: DataFrame, offline_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    date_df = process_date_dimension(spark)
    product_df = process_product_dimension(all_df)
    customer_df = process_customer_dimension(all_df)
    branch_df = process_branch_dimension(offline_df)
    sales_agent_df = process_sales_agent_dimension(offline_df)
    
    return date_df, product_df, customer_df, branch_df, sales_agent_df

def save_dimensions(spark, dimensions: List[Tuple[str, DataFrame]]) -> None:
    for name, df, key in dimensions:
        write_parquet(spark, df, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/{name}/{name}", key)

def main():
    spark = create_spark_session()
    
    try:
        online_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/online_fact_table/online_merged")
        offline_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/offline_fact_table/offline_merged")
        all_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/all_sales_fact_table/sales_merged")
        print("get denorm data")
        
        date_df, product_df, customer_df, branch_df, sales_agent_df = process_dimensions(spark, all_df, offline_df)
        print("process dim")
        
        dimensions = [
            ("product_dim", product_df, "product_id"),
            ("customer_dim", customer_df, "customer_id"),
            ("branch_dim", branch_df, "branch_id"),
            ("sales_agent_dim", sales_agent_df, "sales_agent_id")
        ]
        save_dimensions(spark, dimensions)
        print("save dim")
        
        online_fact = process_online_fact(online_df, date_df, product_df, customer_df)
        offline_fact = process_offline_fact(offline_df, date_df, product_df, customer_df, branch_df, sales_agent_df)
        print("build facts")
        
        write_parquet(spark, online_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/online_sales_fact/online_fact", "transaction_id")
        write_parquet(spark, offline_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/offline_sales_fact/offline_fact", "transaction_id")
        
        print("Normalized model processing completed successfully.")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
    main()




get denorm data
process dim
/user/itversity/q-company_conformed_layer/normalized_model/product_dim/product_dim
True
root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- product_key: integer (nullable = true)

None
Appended 30 rows to /user/itversity/q-company_conformed_layer/normalized_model/product_dim/product_dim
/user/itversity/q-company_conformed_layer/normalized_model/customer_dim/customer_dim
True
root
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_key: integer (nullable = true)

None
Appended 101 rows to /user/itversity/q-company_conformed_layer/normalized_model/customer_dim/customer_dim
/user/itversity/q-company_conformed_layer/normalized_model/branch_dim/branch_dim
True
root
 |-- branch_id: long (nullable = true)
 |-- branch_location: string (nullable

In [6]:
import os
from pyspark.sql.functions import lit
from utils import Config
from typing import Tuple
from datetime import datetime
from utils import *

def get_latest_file(spark, base_path, transaction_type):
    current_date = datetime.now().strftime("%Y-%m-%d")
    print(f"Current date: {current_date}")
    
    file_pattern = f"{base_path}/standardized_sales_transaction_{current_date}/{transaction_type}_transactions*"
    print(f"Searching for files matching pattern: {file_pattern}")
    
    df = spark.read.parquet(file_pattern)
    
    latest_file = sorted(file_paths, key=lambda x: x.split('_')[-1], reverse=True)[0]
    print(f"Latest file: {latest_file}")
    
    return latest_file

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    return spark.read.schema(schema).parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    base_path = "/user/itversity/q-company_standardized_layer"
    
    # Get latest online and offline files
    online_file = get_latest_file(spark, base_path, "online")
    offline_file = get_latest_file(spark, base_path, "offline")
    
    online_df = read_parquet_with_schema(spark, online_file)
    offline_df = read_parquet_with_schema(spark, offline_file)
    
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))
    
    return online_df_a, offline_df_a

def get_all_sales(online_df: DataFrame, offline_df: DataFrame) -> DataFrame:
    
    online_df_a, offline_df_a = align_schemas(online_df, offline_df)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        changes_records.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
        print(f"Appended {changes_records.count()} rows to {file_path}")
    else:
        merged_df = df
        merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
        print(f"Appended {merged_df.count()} rows to {file_path}")
            

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table/offline_merged"
    else:
        return f"{denorm_path}/all_sales_fact_table/sales_merged"

def main():
    spark = SparkSession.builder.appName("DenormalizedModelProcessing").getOrCreate()
    
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
     main()

Current date: 2024-07-13
Searching for files matching pattern: /user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/online_transactions*
An error occurred: name 'file_paths' is not defined


In [50]:
import os
from pyspark.sql.functions import lit
from utils import Config
from typing import Tuple
from datetime import datetime
from utils import *

def get_latest_parquet_file(spark, hdfs_path, file_prefix):
    try:
        # List all files in the HDFS directory
        files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
                    .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
        
        # Filter files by prefix and get only Parquet files
        parquet_files = [f.getPath().toString() for f in files if f.getPath().getName().startswith(file_prefix)]
        
        if not parquet_files:
            print(f"No matching Parquet files found in {hdfs_path}")
            return None
        
        # Sort the files by modification time
        latest_file = max(parquet_files, key=lambda x: os.path.basename(x))
        
        return latest_file
    
    except Exception as e:
        print(f"Error accessing path {hdfs_path}: {str(e)}")
        print("Directory contents:")
        try:
            fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
            status = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
            for fileStatus in status:
                print(fileStatus.getPath().toString())
        except Exception as inner_e:
            print(f"Unable to list directory contents: {str(inner_e)}")
        return None

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    return spark.read.schema(schema).parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    current_day = datetime.now().strftime("%Y-%m-%d")
    input_base_path = f"{Config.STANDARDIZED_BASE_PATH}/standardized_sales_transaction_{current_day}"
    
    # Get latest online file
    online_path = get_latest_parquet_file(spark, input_base_path, "online_transactions")
    if not online_path:
        raise ValueError(f"No online transaction files found in {input_base_path}")
    print(f"Latest online file: {online_path}")
    online_df = read_parquet_with_schema(spark, online_path)
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    
    # Get latest offline file
    offline_path = get_latest_parquet_file(spark, input_base_path, "offline_transactions")
    if not offline_path:
        raise ValueError(f"No offline transaction files found in {input_base_path}")
    print(f"Latest offline file: {offline_path}")
    offline_df = read_parquet_with_schema(spark, offline_path)
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))

    all_df = get_all_sales(online_df_a, offline_df_a)
    
    return online_df_a, offline_df_a, all_df

def get_all_sales(online_df: DataFrame, offline_df: DataFrame) -> DataFrame:
    
    online_df_a, offline_df_a = align_schemas(online_df, offline_df)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
    else:
        merged_df = df
    
    print(merged_df.printSchema())
    merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
    print(f"Appended {df.count()} rows to {file_path}")

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table/offline_merged"
    else:
        return f"{denorm_path}/all_sales_fact_table/sales_merged"

def main():
    spark = SparkSession.builder.appName("DenormalizedModelProcessing").getOrCreate()
    
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
     main()

Latest online file: hdfs://localhost:9000/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/online_transactions_group2_20240713063315
Latest offline file: hdfs://localhost:9000/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/offline_transactions_group2_20240713063418
/user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged
True
root
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- total_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_street_name: string (nullable = true)
 |-- shipping_city: string (nullab

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "<ipython-input-50-6f07ea2de6cc>", line 129, in main
    denorm_modeling(spark, offline_df, 'offline')
  File "<ipython-input-50-6f07ea2de6cc>", line 110, in denorm_modeling
    merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_id"]).mode("append").parquet(file_path)
  File "/opt/spark2/python/pyspark/sql/readwriter.py", line 847, in parquet
    self._jwrite.parquet(path)
  File "/opt/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1255, in __call__
    answer = self.gateway_client.send_command(command)
  File "/opt/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1152, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.6/socket.py", line 586, in read

KeyboardInterrupt: 

In [4]:
import os
from pyspark.sql.functions import lit
from typing import Tuple
from datetime import datetime
from Scripts.utils import *

def get_latest_parquet_file(spark, hdfs_path, file_prefix):
    try:
        files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
                    .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
        
        parquet_files = [f.getPath().toString() for f in files if f.getPath().getName().startswith(file_prefix)]
        
        if not parquet_files:
            print(f"No matching Parquet files found in {hdfs_path}")
            return None
        
        #sorted_files = sorted(files, key=lambda f: f.getModificationTime(), reverse=True)
        #latest_file = sorted_files[0].getPath().toString() if sorted_files else None
        latest_file = max(parquet_files, key=lambda x: os.path.basename(x))
        
        return latest_file
    
    except Exception as e:
        print(f"Error accessing path {hdfs_path}: {str(e)}")
        print("Directory contents:")
        try:
            fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
            status = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
            for fileStatus in status:
                print(fileStatus.getPath().toString())
        except Exception as inner_e:
            print(f"Unable to list directory contents: {str(inner_e)}")
        return None

def align_schemas(df1: DataFrame, df2: DataFrame) -> Tuple[DataFrame, DataFrame]:
    columns1 = set(df1.columns)
    columns2 = set(df2.columns)
    
    df2 = df2.select(*df2.columns, *[lit(None).alias(col) for col in columns1 - columns2])
    df1 = df1.select(*df1.columns, *[lit(None).alias(col) for col in columns2 - columns1])
    
    all_columns = sorted(list(columns1.union(columns2)))
    return df1.select(all_columns), df2.select(all_columns)

def read_parquet_with_schema(spark: SparkSession, path: str) -> DataFrame:
    schema_path = os.path.join(path, "_schema")
    schema_df = spark.read.parquet(schema_path)
    schema = schema_df.schema
    return spark.read.schema(schema).parquet(path)

def process_denormalized_model(spark: SparkSession) -> Tuple[DataFrame, DataFrame, DataFrame]:
    current_day = datetime.now().strftime("%Y-%m-%d")
    input_base_path = f"{Config.STANDARDIZED_BASE_PATH}/standardized_sales_transaction_{current_day}"
    
    # Get latest online file
    online_path = get_latest_parquet_file(spark, input_base_path, "online_transactions")
    if not online_path:
        raise ValueError(f"No online transaction files found in {input_base_path}")
    print(f"Latest online file: {online_path}")
    online_df = read_parquet_with_schema(spark, online_path)
    online_df_a = online_df.withColumn("transaction_type", lit("online"))
    
    offline_path = get_latest_parquet_file(spark, input_base_path, "offline_transactions")
    if not offline_path:
        raise ValueError(f"No offline transaction files found in {input_base_path}")
    print(f"Latest offline file: {offline_path}")
    offline_df = read_parquet_with_schema(spark, offline_path)
    offline_df_a = offline_df.withColumn("transaction_type", lit("offline"))

    all_df = get_all_sales(online_df_a, offline_df_a)
    
    return online_df_a, offline_df_a, all_df

def get_all_sales(online_df: DataFrame, offline_df: DataFrame) -> DataFrame:
    
    online_df_a, offline_df_a = align_schemas(online_df, offline_df)
    
    all_df = online_df_a.union(offline_df_a)
    
    new_order = [
        'transaction_id', 'transaction_date', 'transaction_type', 'customer_id', 'customer_name', 'customer_email',
        'product_id', 'product_name', 'product_category', 'units', 'unit_price', 'discount',
        'payment_method', 'group', 'sales_agent_id', 'sales_agent_name',
        'sales_agent_hire_date', 'branch_id', 'branch_location', 'branch_class',
        'shipping_street_name', 'shipping_city', 'shipping_state', 'shipping_zip_code'
    ]
    
    return all_df.select(new_order)

def denorm_modeling(spark: SparkSession, df: DataFrame, transaction_type: str) -> None:
    file_path = get_file_path(transaction_type)
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    path = spark._jvm.org.apache.hadoop.fs.Path(file_path)
    merged_file_exists = fs.exists(path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(file_path)
        changes_records = df.join(existing_df, on="transaction_id", how="left_anti")
        merged_df = existing_df.unionByName(changes_records)
        changes_records.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
    else:
        merged_df = df
        merged_df.write.option("schema", df.schema.json()).partitionBy(["transaction_date"]).mode("append").parquet(file_path)
        print(f"Appended {merged_df.count()} rows to {file_path}")
  
    print(merged_df.printSchema())

def get_file_path(transaction_type: str) -> str:
    denorm_path = Config.CONFORMED_DENORMALIZED_BASE_PATH
    if transaction_type == 'online':
        return f"{denorm_path}/online_fact_table/online_merged"
    elif transaction_type == 'offline':
        return f"{denorm_path}/offline_fact_table/offline_merged"
    else:
        return f"{denorm_path}/all_sales_fact_table/sales_merged"

def main():
    spark = SparkSession.builder.appName("DenormalizedModelProcessing").getOrCreate()
    
    try:
        online_df, offline_df, all_df = process_denormalized_model(spark)
        
        denorm_modeling(spark, online_df, 'online')
        denorm_modeling(spark, offline_df, 'offline')
        denorm_modeling(spark, all_df, 'all')
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()

if __name__ == "__main__":
     main()

Latest online file: hdfs://localhost:9000/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/online_transactions_group2_20240713120140
Latest offline file: hdfs://localhost:9000/user/itversity/q-company_standardized_layer/standardized_sales_transaction_2024-07-13/offline_transactions_group2_20240713120344
/user/itversity/q-company_conformed_layer/denormalized_model/online_fact_table/online_merged
True
root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- units: long (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- discount: float (nullable = true)
 |-- total_price: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- shipping_street_name: string (nulla

In [6]:
def create_spark_session() -> SparkSession:
    return SparkSession.builder \
        .appName("Normalized_Model") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()
spark = create_spark_session()

In [12]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, date_format, row_number
from pyspark.sql.window import Window
from functools import reduce
from typing import List, Tuple
from Scripts.utils import Config
import os


def get_latest_parquet_file(spark, hdfs_path, file_prefix):
    try:
        files = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) \
                    .listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
        
        parquet_files = [f.getPath().toString() for f in files if f.getPath().getName().startswith(file_prefix)]
        
        if not parquet_files:
            print(f"No matching Parquet files found in {hdfs_path}")
            return None
        
        latest_file = max(parquet_files, key=lambda x: os.path.basename(x))
        
        return latest_file
    
    except Exception as e:
        print(f"Error accessing path {hdfs_path}: {str(e)}")
        print("Directory contents:")
        try:
            fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
            status = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(hdfs_path))
            for fileStatus in status:
                print(fileStatus.getPath().toString())
        except Exception as inner_e:
            print(f"Unable to list directory contents: {str(inner_e)}")
        return None

def create_spark_session() -> SparkSession:
    return SparkSession.builder \
        .appName("Normalized_Model") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .getOrCreate()

def read_parquet(spark: SparkSession, path: str) -> DataFrame:
    return spark.read.parquet(path)

def write_parquet(spark, df: DataFrame, path: str, on: str) -> None:
    
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
    file_path = spark._jvm.org.apache.hadoop.fs.Path(path)
    merged_file_exists = fs.exists(file_path)
    print(path)
    print(merged_file_exists)
    if merged_file_exists:
        existing_df = spark.read.parquet(path)
        changes_records = df.join(existing_df, on=on, how="left_anti")
        changes_records.write.mode('append').parquet(path)
        print(f"Appended {changes_records.count()} rows to {file_path}")
    else:
        merged_df = df
        merged_df.write.mode('overwrite').parquet(path)
        print(f"Appended {merged_df.count()} rows to {file_path}")
        print(merged_df.printSchema())

    

def add_surrogate_key(df: DataFrame, key_name: str, order_by: str) -> DataFrame:
    window_spec = Window.orderBy(order_by)
    return df.withColumn(key_name, row_number().over(window_spec))

def process_date_dimension(spark) -> DataFrame:
        return read_parquet(spark, "/user/itversity/q-company_conformed_layer/normalized_model/date_dim/date_dim_table")

def process_product_dimension(df: DataFrame) -> DataFrame:
    product_dim = df.select("product_id", "product_name", "product_category", "unit_price").distinct()
    return add_surrogate_key(product_dim, "product_key", "product_id")

def process_customer_dimension(df: DataFrame) -> DataFrame:
    customer_dim = df.select("customer_id", "customer_name", "customer_email").distinct()
    return add_surrogate_key(customer_dim, "customer_key", "customer_id")

def process_branch_dimension(df: DataFrame) -> DataFrame:
    branch_dim = df.select("branch_id", "branch_location", "branch_establish_date", "branch_class").distinct()
    return add_surrogate_key(branch_dim, "branch_key", "branch_id")

def process_sales_agent_dimension(df: DataFrame) -> DataFrame:
    sales_agent_dim = df.select("sales_agent_id", "sales_agent_hire_date", "sales_agent_name").distinct()
    return add_surrogate_key(sales_agent_dim, "sales_agent_key", "sales_agent_id")

def process_online_fact(online_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame) -> DataFrame:
    online_fact = online_df.select(
        "transaction_id", "customer_id", "product_id", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    online_fact = online_fact.join(date_df.select('date_key', 'date'), online_fact["transaction_date"] == date_df["date"], "inner") \
                             .join(product_df.select('product_key', 'product_id'), "product_id") \
                             .join(customer_df.select('customer_key', 'customer_id'), "customer_id")
        
    return online_fact.select(
        "transaction_id", "customer_key", "product_key", "date_key", "units", "unit_price", "discount",
        "payment_method", "group", "total_price", "shipping_zip_code", "shipping_state",
        "shipping_city", "shipping_street_name"
    )

def process_offline_fact(offline_df: DataFrame, date_df: DataFrame, product_df: DataFrame, customer_df: DataFrame, branch_df: DataFrame, sales_agent_df: DataFrame) -> DataFrame:
    offline_fact = offline_df.select(
        "transaction_id", "customer_id", "sales_agent_id", "branch_id", "product_id",
        "units", "unit_price", "discount", "payment_method", "total_price",
        date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )
    
    offline_fact = offline_fact.join(date_df.select('date_key', 'date'), offline_fact["transaction_date"] == date_df["date"], "inner") \
                               .join(product_df.select('product_key', 'product_id'), "product_id") \
                               .join(customer_df.select('customer_key', 'customer_id'), "customer_id") \
                               .join(branch_df.select('branch_key', 'branch_id'), "branch_id") \
                               .join(sales_agent_df.select('sales_agent_key', 'sales_agent_id'), "sales_agent_id")
    
    return offline_fact.select(
        "transaction_id", "customer_key", "sales_agent_key", "branch_key", "product_key", "date_key",
        "units", "unit_price", "discount", "payment_method", "total_price", date_format("transaction_date", "yyyy-MM-dd").alias("transaction_date")
    )

def process_dimensions(spark: SparkSession, all_df: DataFrame, offline_df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    date_df = process_date_dimension(spark)
    product_df = process_product_dimension(all_df)
    customer_df = process_customer_dimension(all_df)
    branch_df = process_branch_dimension(offline_df)
    sales_agent_df = process_sales_agent_dimension(offline_df)
    
    return date_df, product_df, customer_df, branch_df, sales_agent_df

def save_dimensions(spark, dimensions: List[Tuple[str, DataFrame]]) -> None:
    for name, df, key in dimensions:
        write_parquet(spark, df, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/{name}/{name}", key)

def main():
    spark = create_spark_session()
    
    try:
        online_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/online_fact_table/online_merged")
        offline_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/offline_fact_table/offline_merged")
        all_df = read_parquet(spark, f"{Config.CONFORMED_DENORMALIZED_BASE_PATH}/all_sales_fact_table/sales_merged")
        print("get denorm data")
        
        date_df, product_df, customer_df, branch_df, sales_agent_df = process_dimensions(spark, all_df, offline_df)
        print("process dim")
        
        dimensions = [
            ("product_dim", product_df, "product_id"),
            ("customer_dim", customer_df, "customer_id"),
            ("branch_dim", branch_df, "branch_id"),
            ("sales_agent_dim", sales_agent_df, "sales_agent_id")
        ]
        save_dimensions(spark, dimensions)
        print("save dim")
        
        online_fact = process_online_fact(online_df, date_df, product_df, customer_df)
        offline_fact = process_offline_fact(offline_df, date_df, product_df, customer_df, branch_df, sales_agent_df)
        print("build facts")
        
        write_parquet(spark, online_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/online_sales_fact/online_fact", "transaction_id")
        write_parquet(spark, offline_fact, f"{Config.CONFORMED_NORMALIZED_BASE_PATH}/offline_sales_fact/offline_fact", "transaction_id")
        
        print("Normalized model processing completed successfully.")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        spark.stop()
        
        
        
        
        
        
        
        

if __name__ == "__main__":
    main()


get denorm data
process dim
/user/itversity/q-company_conformed_layer/normalized_model/product_dim/product_dim
True
Appended 0 rows to /user/itversity/q-company_conformed_layer/normalized_model/product_dim/product_dim
/user/itversity/q-company_conformed_layer/normalized_model/customer_dim/customer_dim
True
Appended 0 rows to /user/itversity/q-company_conformed_layer/normalized_model/customer_dim/customer_dim
/user/itversity/q-company_conformed_layer/normalized_model/branch_dim/branch_dim
True
Appended 0 rows to /user/itversity/q-company_conformed_layer/normalized_model/branch_dim/branch_dim
/user/itversity/q-company_conformed_layer/normalized_model/sales_agent_dim/sales_agent_dim
True
Appended 0 rows to /user/itversity/q-company_conformed_layer/normalized_model/sales_agent_dim/sales_agent_dim
save dim
build facts
/user/itversity/q-company_conformed_layer/normalized_model/online_sales_fact/online_fact
True
Appended 1000 rows to /user/itversity/q-company_conformed_layer/normalized_model/

In [None]:
spark.stop()