In [4]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, row_number, coalesce, when, first, monotonically_increasing_id,current_timestamp
from pyspark.sql.window import Window

# Connection settings are read from the environment when present, so credentials
# and machine-specific paths do not have to be edited into the notebook.
# Every fallback is the value this notebook originally hard-coded, so a run with
# no environment variables set behaves exactly as before.
postgres_jdbc_jar = os.environ.get("POSTGRES_JDBC_JAR", "/home/jovyan/postgresql-42.7.4.jar")

# The PostgreSQL JDBC jar is added to the driver classpath; `properties` below
# names org.postgresql.Driver as the JDBC driver class.
spark = (
    SparkSession.builder
    .appName("DWH Population")
    .config("spark.driver.extraClassPath", postgres_jdbc_jar)
    .getOrCreate()
)

# JDBC URL and connection properties shared by every spark.read.jdbc / write.jdbc call.
jdbc_url = os.environ.get("CRAFT_MARKET_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")
properties = {
    "user": os.environ.get("CRAFT_MARKET_DB_USER", "postgres"),
    # NOTE(review): the fallback password is kept only for backward compatibility;
    # set CRAFT_MARKET_DB_PASSWORD instead of relying on it outside local development.
    "password": os.environ.get("CRAFT_MARKET_DB_PASSWORD", "mysecretpassword"),
    "driver": "org.postgresql.Driver",
}



In [5]:
def _read_table(table_name):
    """Read `table_name` over JDBC using the notebook-wide `jdbc_url` and `properties`."""
    return spark.read.jdbc(url=jdbc_url, table=table_name, properties=properties)


# Source 1: a single table in the source1 schema.
df_first = _read_table("source1.craft_market_wide")

# Source 2: two tables in the source2 schema, inner-joined on craftsman_id.
# NOTE(review): craftsman_id is the only join key. If one craftsman can have
# several rows in craft_market_masters_products, every order row of that
# craftsman is repeated once per such row -- confirm against the source2 schema
# whether another key (e.g. a product id) should be part of the join.
df_second_masters_products = _read_table("source2.craft_market_masters_products")
df_second_orders_customers = _read_table("source2.craft_market_orders_customers")
df_second = df_second_orders_customers.join(
    df_second_masters_products,
    on="craftsman_id",
    how="inner",
)

# Source 3: orders joined to their craftsman rows, then to their customer rows.
df_third_craftsmans = _read_table("source3.craft_market_craftsmans")
df_third_customers = _read_table("source3.craft_market_customers")
df_third_orders = _read_table("source3.craft_market_orders")
_third_orders_with_craftsmans = df_third_orders.join(df_third_craftsmans, on="craftsman_id", how="inner")
df_third = _third_orders_with_craftsmans.join(df_third_customers, on="customer_id", how="inner")

In [6]:
# Target column layout shared by all three sources, in the order used for the union.
columns = [
    'craftsman_name', 'craftsman_address', 'craftsman_birthday', 'craftsman_email',
    'product_name', 'product_description', 'product_type', 'product_price',
    'order_created_date', 'order_completion_date', 'order_status',
    'customer_name', 'customer_address', 'customer_birthday', 'customer_email',
]


def _project_to_common_columns(source_df):
    """Select `columns` from `source_df`, in order; a column the frame lacks becomes a NULL literal with that name."""
    available = source_df.columns
    projected = []
    for column_name in columns:
        if column_name in available:
            projected.append(col(column_name))
        else:
            projected.append(lit(None).alias(column_name))
    return source_df.select(projected)


dfs = [df_first, df_second, df_third]
dfs = [_project_to_common_columns(source_df) for source_df in dfs]

# union() matches columns by position; every frame was projected to the same
# ordered column list above, so positions line up.
combined_df = dfs[0]
for _next_df in dfs[1:]:
    combined_df = combined_df.union(_next_df)

final_df = combined_df


In [7]:
def _build_dimension(source_df, attr_cols, key_cols, id_col, id_window):
    """Build a dimension DataFrame from `source_df`.

    Selects `attr_cols`, keeps one row per distinct `key_cols` combination
    (dropDuplicates), and adds `id_col` as row_number() over `id_window`.
    Which duplicate's non-key attributes survive is not controlled here.
    """
    return (
        source_df.select(*attr_cols)
        .dropDuplicates(key_cols)
        .withColumn(id_col, row_number().over(id_window))
    )


def _attach_surrogate_key(facts_df, dimension_df, key_cols, id_col):
    """Left-join `id_col` from `dimension_df` onto `facts_df` by `key_cols`.

    The comparison is null-safe (eqNullSafe): dropDuplicates groups NULL keys
    into a dimension row, and a plain equality join would never match that row,
    leaving the surrogate id NULL on the fact side.
    """
    # Rename the dimension's key columns so the join condition and the result
    # contain no ambiguous column names.
    lookup_names = {key: f"__dim_{key}" for key in key_cols}
    lookup = dimension_df.select(
        *[col(key).alias(alias) for key, alias in lookup_names.items()],
        col(id_col),
    )
    condition = None
    for key, alias in lookup_names.items():
        key_matches = col(key).eqNullSafe(col(alias))
        condition = key_matches if condition is None else condition & key_matches
    return facts_df.join(lookup, condition, "left").drop(*lookup_names.values())


# The ordering windows below have no partitionBy, so each row_number() is
# computed over the whole DataFrame as a single window partition.

# Step 1: Create d_craftsmans table (one row per craftsman_name + craftsman_birthday)
window_craftsman = Window.orderBy("craftsman_name", "craftsman_birthday")
d_craftsmans = _build_dimension(
    final_df,
    attr_cols=["craftsman_name", "craftsman_address", "craftsman_birthday", "craftsman_email"],
    key_cols=["craftsman_name", "craftsman_birthday"],
    id_col="craftsman_id",
    id_window=window_craftsman,
)

# Step 2: Create d_customers table (one row per customer_name + customer_birthday)
window_customer = Window.orderBy("customer_name", "customer_birthday")
d_customers = _build_dimension(
    final_df,
    attr_cols=["customer_name", "customer_address", "customer_birthday", "customer_email"],
    key_cols=["customer_name", "customer_birthday"],
    id_col="customer_id",
    id_window=window_customer,
)

# Step 3: Create d_products table (one row per product_name + product_price)
window_product = Window.orderBy("product_name", "product_price")
d_products = _build_dimension(
    final_df,
    attr_cols=["product_name", "product_description", "product_type", "product_price"],
    key_cols=["product_name", "product_price"],
    id_col="product_id",
    id_window=window_product,
)

# Step 4: Create f_orders by looking up each dimension's surrogate id through
# the same natural keys that were used to deduplicate that dimension.
_orders_with_keys = final_df
for _dimension_df, _key_cols, _id_col in (
    (d_products, ["product_name", "product_price"], "product_id"),
    (d_craftsmans, ["craftsman_name", "craftsman_birthday"], "craftsman_id"),
    (d_customers, ["customer_name", "customer_birthday"], "customer_id"),
):
    _orders_with_keys = _attach_surrogate_key(_orders_with_keys, _dimension_df, _key_cols, _id_col)

f_orders = _orders_with_keys.select(
    # order_id is generated here; rows that tie on all three ids get
    # consecutive numbers in an unspecified relative order.
    row_number().over(Window.orderBy("product_id", "craftsman_id", "customer_id")).alias("order_id"),
    "product_id",
    "craftsman_id",
    "customer_id",
    "order_created_date",
    "order_completion_date",
    "order_status",
)

# Stamp every output table with a load timestamp column.
d_craftsmans = d_craftsmans.withColumn("load_dttm", current_timestamp())
d_customers = d_customers.withColumn("load_dttm", current_timestamp())
d_products = d_products.withColumn("load_dttm", current_timestamp())
f_orders = f_orders.withColumn("load_dttm", current_timestamp())

# Row-count sanity check; each count() call runs its own Spark job.
print(f"{final_df.count()=} {d_craftsmans.count()=} {d_customers.count()=} {d_products.count()=} {f_orders.count()=}")


final_df.count()=3006 d_craftsmans.count()=2999 d_customers.count()=3001 d_products.count()=2994 f_orders.count()=3006


In [9]:
# Step 1: Write d_craftsmans table to DWH
d_craftsmans.write.jdbc(url=jdbc_url, table="dwh.d_craftsmans", mode="append", properties=properties)
# Step 2: Write d_customers table to DWH
d_customers.write.jdbc(url=jdbc_url, table="dwh.d_customers", mode="append", properties=properties)
# Step 3: Write d_products table to DWH
d_products.write.jdbc(url=jdbc_url, table="dwh.d_products", mode="append", properties=properties)
# Step 4: Write f_orders table to DWH
f_orders.write.jdbc(url=jdbc_url, table="dwh.f_orders", mode="append", properties=properties)