In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Create (or reuse) the Spark session for this notebook. The `spark.jars` setting
# points Spark at the PostgreSQL JDBC driver jar used by the JDBC reads and writes.
spark = (
    SparkSession.builder
    .appName('PostgreSQL Snowflake')
    .config('spark.jars', '/opt/spark/jars/postgresql-42.7.5.jar')
    .getOrCreate()
)

In [3]:
# JDBC connection settings shared by every read and write in this notebook.
# The user, password and URL are taken from the environment when the variables
# JDBC_USER / JDBC_PASSWORD / JDBC_URL are set, so real credentials do not have
# to be committed with the notebook. The fallback values are the ones this
# notebook used before, so it still runs unchanged when nothing is configured.
properties = {
    'user': os.environ.get('JDBC_USER', 'postgres'),
    'password': os.environ.get('JDBC_PASSWORD', 'password'),
    'driver': 'org.postgresql.Driver',
}
url = os.environ.get('JDBC_URL', 'jdbc:postgresql://bigdata_pg:5432/postgres_db')

# Source table: the flat `mock_data` table that every dimension and the sales
# fact below are derived from.
mock_data = spark.read.jdbc(url=url, table='mock_data', properties=properties)

In [4]:
# Customer dimension: project the customer columns of `mock_data` under shorter
# names and keep a single row per `id`. No ordering is applied before the
# de-duplication, so which of several rows sharing an id survives is not
# controlled here.
customer_fields = (
    ('sale_customer_id', 'id'),
    ('customer_first_name', 'first_name'),
    ('customer_last_name', 'last_name'),
    ('customer_age', 'age'),
    ('customer_email', 'email'),
    ('customer_country', 'country'),
    ('customer_postal_code', 'postal_code'),
)
customers = mock_data.select(
    [col(source).alias(target) for source, target in customer_fields]
).dropDuplicates(['id'])

# Append to the `customers` table; re-running this cell appends the rows again.
customers.write.jdbc(url=url, table='customers', mode='append', properties=properties)

In [5]:
# Product dimension: maps each source column of `mock_data` to the column name
# written to the `products` table (source name -> target name).
product_fields = {
    'sale_product_id': 'id',
    'product_name': 'name',
    'product_category': 'category',
    'product_price': 'price',
    'product_quantity': 'quantity',
    'product_weight': 'weight',
    'product_color': 'color',
    'product_size': 'size',
    'product_brand': 'brand',
    'product_material': 'material',
    'product_description': 'description',
    'product_rating': 'rating',
    'product_reviews': 'reviews',
    'product_release_date': 'release_date',
    'product_expiry_date': 'expiry_date',
}

# Project and rename the columns, then keep one row per product `id`.
products = mock_data.select(
    [col(source).alias(target) for source, target in product_fields.items()]
).dropDuplicates(['id'])

# Append to the `products` table; re-running this cell appends the rows again.
products.write.jdbc(url=url, table='products', mode='append', properties=properties)

In [6]:
# Seller dimension: pick the seller columns out of `mock_data`, renaming them to
# the names used by the `sellers` table.
seller_projection = mock_data.select(
    mock_data['sale_seller_id'].alias('id'),
    mock_data['seller_first_name'].alias('first_name'),
    mock_data['seller_last_name'].alias('last_name'),
    mock_data['seller_email'].alias('email'),
    mock_data['seller_country'].alias('country'),
    mock_data['seller_postal_code'].alias('postal_code'),
)

# One row per seller `id`.
sellers = seller_projection.dropDuplicates(['id'])

# Append to the `sellers` table; re-running this cell appends the rows again.
sellers.write.jdbc(url=url, table='sellers', mode='append', properties=properties)

In [7]:
# Store dimension. There is no store id column selected here; stores are
# identified (and de-duplicated) by `name`.
store_columns = [
    col('store_name').alias('name'),
    col('store_location').alias('location'),
    col('store_city').alias('city'),
    col('store_state').alias('state'),
    col('store_country').alias('country'),
    col('store_phone').alias('phone'),
    col('store_email').alias('email'),
]
stores = mock_data.select(*store_columns).dropDuplicates(['name'])

# Append to the `stores` table; re-running this cell appends the rows again.
stores.write.jdbc(url=url, table='stores', mode='append', properties=properties)

In [8]:
# Supplier dimension: rename the supplier columns of `mock_data` to the names
# used by the `suppliers` table.
supplier_rows = mock_data.select(
    col('supplier_name').alias('name'),
    col('supplier_contact').alias('contact'),
    col('supplier_email').alias('email'),
    col('supplier_phone').alias('phone'),
    col('supplier_address').alias('address'),
    col('supplier_city').alias('city'),
    col('supplier_country').alias('country'),
)

# Suppliers are keyed by `name`: keep a single row for each distinct name.
suppliers = supplier_rows.dropDuplicates(subset=['name'])

# Append to the `suppliers` table; re-running this cell appends the rows again.
suppliers.write.jdbc(url=url, table='suppliers', mode='append', properties=properties)

In [9]:
# Sales fact table: one row per `mock_data` record, referencing each dimension
# by its key (customer/product/seller id, store name, supplier name).
#
# The dimension frames (customers, products, sellers, stores, suppliers) are
# de-duplicated projections of `mock_data` itself, keyed on exactly these
# columns. Inner-joining `mock_data` back onto them and selecting only the keys
# matches every non-null key exactly once, so the only effect of such joins is
# to drop rows where a key is NULL (NULL never satisfies `==`). The same result
# is obtained directly, without five self-joins, by keeping only the rows in
# which every dimension key is present.
has_all_dimension_keys = (
    col('sale_customer_id').isNotNull()
    & col('sale_product_id').isNotNull()
    & col('sale_seller_id').isNotNull()
    & col('store_name').isNotNull()
    & col('supplier_name').isNotNull()
)

sales = mock_data.where(has_all_dimension_keys).select(
    col('sale_date').alias('date'),
    col('sale_customer_id').alias('customer_id'),
    col('sale_product_id').alias('product_id'),
    col('sale_seller_id').alias('seller_id'),
    col('store_name').alias('store_name'),
    col('supplier_name').alias('supplier_name'),
    col('sale_quantity').alias('quantity'),
    col('sale_total_price').alias('total_price'),
)

# Append to the `sales` table; re-running this cell appends the rows again.
sales.write.mode('append').jdbc(url=url, table='sales', properties=properties)

In [10]:
# Shut down the Spark session created at the top of the notebook; this is the
# final step, after all dimension and fact tables have been written.
spark.stop()