In [1]:
import os

import psycopg2
from psycopg2 import sql
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from pyspark.sql import SparkSession

# CONSTANTS
# Connection settings for the output PostgreSQL database. Each value can be
# overridden with an OUTPUT_DB_* environment variable so real credentials do not
# have to be written into the notebook; the fallbacks are the local defaults.
DB_NAME = os.environ.get('OUTPUT_DB_NAME', 'drink_distribution_company')
DB_USER = os.environ.get('OUTPUT_DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('OUTPUT_DB_PASSWORD', '1234')
DB_HOST = os.environ.get('OUTPUT_DB_HOST', 'output-db')
DB_PORT = int(os.environ.get('OUTPUT_DB_PORT', '5433'))

# Directory holding one parquet dataset per table, read as <INPUT_PATH>/<table name>.
INPUT_PATH = 'storage/drink_distribution_company/parquet'

# One entry per table:
#   (table name, [primary-key columns], [(foreign-key column, 'referenced_table(column)'), ...])
TABLE_PROPERTIES = [
    ('city', ['id'], []),
    ('inventory', ['id'], [
        ('store_id', 'store(id)'),
        ('product_id', 'product(id)'),
    ]),
    ('invoice', ['id'], [
        ('purchase_order_id', 'purchase_order(id)'),
    ]),
    ('product', ['id'], []),
    ('purchase', ['id'], [
        ('purchase_order_id', 'purchase_order(id)'),
        ('inventory_id', 'inventory(id)'),
    ]),
    ('purchase_order', ['id'], [
        ('vendor_id', 'vendor(id)'),
    ]),
    ('sale', ['id'], [
        ('inventory_id', 'inventory(id)'),
        ('vendor_id', 'vendor(id)'),
    ]),
    ('store', ['id'], []),
    ('vendor', ['id'], []),
]

In [2]:
# Create the target database (DB_NAME) if it doesn't exist.
# Errors are reported with a message rather than raised, and the connection is
# always closed, including when a statement fails part-way through.
conn = None
try:
    # Connect to the 'postgres' database, since DB_NAME may not exist yet.
    conn = psycopg2.connect(dbname='postgres', user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT)
    # PostgreSQL does not allow CREATE DATABASE inside a transaction block,
    # so run this connection in autocommit mode.
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    with conn.cursor() as cur:
        # The name is passed as a bound parameter instead of being spliced into the SQL.
        cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (DB_NAME,))
        if cur.fetchone() is None:
            # Identifiers cannot be bound as parameters; sql.Identifier quotes the name safely.
            cur.execute(sql.SQL('CREATE DATABASE {}').format(sql.Identifier(DB_NAME)))
            print(f"Database '{DB_NAME}' created.")
        else:
            print(f"Database '{DB_NAME}' already exists.")
except Exception as e:
    print(f"Error creating database: {e}")
finally:
    if conn is not None:
        conn.close()

Database 'drink_distribution_company' already exists.


In [3]:
# Start (or reuse) a Spark session with the PostgreSQL JDBC driver jar attached.
spark = (
    SparkSession.builder.appName("write-to-db")
    .config("spark.jars", "/usr/local/spark/jars/postgresql.jar")
    .getOrCreate()
)

# The JDBC target and credentials are the same for every table, so build them once.
jdbc_url = f'jdbc:postgresql://{DB_HOST}:{DB_PORT}/{DB_NAME}'
properties = {
    'user': f'{DB_USER}',
    'password': f'{DB_PASSWORD}',
    'driver': 'org.postgresql.Driver',
}

# mode="overwrite" drops and recreates each table. PostgreSQL refuses to drop a
# table that other tables still reference through a foreign key, so on a re-run
# first remove the foreign keys added by the constraints cell further down
# (named "<table>_<column>_fkey"); that cell re-creates them afterwards.
conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT)
try:
    with conn.cursor() as cur:
        for table_name, _, foreign_keys in TABLE_PROPERTIES:
            for column, _ in foreign_keys:
                # IF EXISTS on both the table and the constraint makes this a no-op on a first run.
                cur.execute(
                    f'ALTER TABLE IF EXISTS "{table_name}" '
                    f'DROP CONSTRAINT IF EXISTS {table_name}_{column}_fkey'
                )
    conn.commit()
finally:
    conn.close()

# Load every table's parquet dataset and write it to PostgreSQL.
# The DataFrames are kept in _df_raw, keyed by table name.
_df_raw = {}
for table_name, _, _ in TABLE_PROPERTIES:
    filename = rf'{INPUT_PATH}/{table_name}'
    # No "header" option here: it is a CSV reader option and has no effect on parquet.
    _df_raw[table_name] = spark.read.parquet(filename)

    print(f'Writing {table_name} to PostgreSQL database...')
    _df_raw[table_name].write.jdbc(url=jdbc_url, table=table_name, mode="overwrite", properties=properties)
print('Finished!')

Writing city to PostgreSQL database...
Writing inventory to PostgreSQL database...
Writing invoice to PostgreSQL database...
Writing product to PostgreSQL database...
Writing purchase to PostgreSQL database...
Writing purchase_order to PostgreSQL database...
Writing sale to PostgreSQL database...
Writing store to PostgreSQL database...
Writing vendor to PostgreSQL database...
Finished!


In [7]:
# Use psycopg2 to add PRIMARY KEY and FOREIGN KEY constraints to the tables
# listed in TABLE_PROPERTIES. Constraints that already exist are skipped, so
# the cell can be run more than once.

def _constraint_exists(cursor, table, constraint):
    """Return True if information_schema.table_constraints has `constraint` on `table`."""
    cursor.execute(
        "SELECT 1 FROM information_schema.table_constraints WHERE table_name=%s AND constraint_name=%s",
        (table, constraint),
    )
    return cursor.fetchone() is not None


conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT)
try:
    with conn.cursor() as cur:
        # Add every primary key before any foreign key: a FOREIGN KEY can only
        # reference columns that already carry a primary-key/unique constraint.
        for table, pk_columns, _ in TABLE_PROPERTIES:
            if not pk_columns:
                continue
            constraint = f"{table}_pkey"
            if not _constraint_exists(cur, table, constraint):
                pk_constraint_query = f'ALTER TABLE "{table}" ADD CONSTRAINT {constraint} PRIMARY KEY ({", ".join(pk_columns)})'
                print(pk_constraint_query)
                cur.execute(pk_constraint_query)

        for table, _, foreign_keys in TABLE_PROPERTIES:
            for column, reference in foreign_keys:
                constraint = f"{table}_{column}_fkey"
                if not _constraint_exists(cur, table, constraint):
                    fk_constraint_query = f'ALTER TABLE "{table}" ADD CONSTRAINT {constraint} FOREIGN KEY ({column}) REFERENCES {reference}'
                    print(fk_constraint_query)
                    cur.execute(fk_constraint_query)
    # All constraints are committed together; if any statement raises, the
    # commit is skipped and closing the connection discards the pending changes.
    conn.commit()
finally:
    conn.close()

ALTER TABLE "city" ADD CONSTRAINT city_pkey PRIMARY KEY (id)
ALTER TABLE "inventory" ADD CONSTRAINT inventory_pkey PRIMARY KEY (id)
ALTER TABLE "invoice" ADD CONSTRAINT invoice_pkey PRIMARY KEY (id)
ALTER TABLE "product" ADD CONSTRAINT product_pkey PRIMARY KEY (id)
ALTER TABLE "purchase" ADD CONSTRAINT purchase_pkey PRIMARY KEY (id)
ALTER TABLE "purchase_order" ADD CONSTRAINT purchase_order_pkey PRIMARY KEY (id)
ALTER TABLE "sale" ADD CONSTRAINT sale_pkey PRIMARY KEY (id)
ALTER TABLE "store" ADD CONSTRAINT store_pkey PRIMARY KEY (id)
ALTER TABLE "vendor" ADD CONSTRAINT vendor_pkey PRIMARY KEY (id)
ALTER TABLE "inventory" ADD CONSTRAINT inventory_store_id_fkey FOREIGN KEY (store_id) REFERENCES store(id)
ALTER TABLE "inventory" ADD CONSTRAINT inventory_product_id_fkey FOREIGN KEY (product_id) REFERENCES product(id)
ALTER TABLE "invoice" ADD CONSTRAINT invoice_purchase_order_id_fkey FOREIGN KEY (purchase_order_id) REFERENCES purchase_order(id)
ALTER TABLE "purchase" ADD CONSTRAINT purchas