In [None]:
import os

import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from pyspark.sql import SparkSession

# CONSTANTS
# Connection settings for the output PostgreSQL instance. Each one can be
# overridden with an environment variable of the same name, so real credentials
# never have to be typed into (and committed with) the notebook. The fallbacks
# are the original local-development values.
DB_NAME = os.environ.get('DB_NAME', 'taxi_industry')
DB_USER = os.environ.get('DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('DB_PASSWORD', '1234')
DB_HOST = os.environ.get('DB_HOST', 'output-db')
DB_PORT = int(os.environ.get('DB_PORT', '5433'))  # stays an int, as before

# Root directory of the processed data; each table is read from
# INPUT_PATH/<table name> as a parquet dataset.
INPUT_PATH = 'storage/taxi_industry/parquet_processed'

# One entry per table: (table name, [primary-key columns], [foreign keys]).
# * Primary-key columns are pasted into the ALTER TABLE statement verbatim, so
#   mixed-case column names must carry their own double quotes here.
# * Foreign keys are (local column, 'referenced_table(column)') pairs. The local
#   column is double-quoted by the constraint cell; the reference is pasted
#   verbatim, so it must be quoted here where needed.
TABLE_PROPERTIES = [
    ('trip', ['id'], [
        ('VendorID', 'vendor("VendorID")'),
        ('RatecodeID', 'rate_code("RatecodeID")'),
        ('PULocationID', 'taxi_zone("LocationID")'),
        ('DOLocationID', 'taxi_zone("LocationID")'),
        ('payment_type', 'payment_type(payment_type)')
    ]),
    ('taxi_zone', ['"LocationID"'], []),
    ('vendor', ['"VendorID"'], []),
    ('rate_code', ['"RatecodeID"'], []),
    ('payment_type', ['"payment_type"'], []),
]

In [6]:
# Create the target database (DB_NAME) if it doesn't exist.
# Errors are reported with a message rather than raised, as before; the
# connection and cursor are now always released.
conn = None
try:
    # Connect to the default 'postgres' database, because the target database
    # may not exist yet.
    conn = psycopg2.connect(dbname='postgres', user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT)
    # CREATE DATABASE cannot run inside a transaction block, hence autocommit.
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cur = conn.cursor()
    try:
        # Bind the name as a parameter instead of interpolating it into the SQL.
        cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (DB_NAME,))
        if cur.fetchone() is None:
            # Identifiers cannot be bound as parameters, so quote the name by
            # hand. Quoting also keeps the created name byte-for-byte equal to
            # the one checked above (an unquoted name would be lower-cased).
            quoted_db_name = '"' + DB_NAME.replace('"', '""') + '"'
            cur.execute(f'CREATE DATABASE {quoted_db_name}')
            print(f"Database '{DB_NAME}' created.")
        else:
            print(f"Database '{DB_NAME}' already exists.")
    finally:
        cur.close()
except Exception as e:
    print(f"Error creating database: {e}")
finally:
    if conn is not None:
        conn.close()

Database 'taxi_industry' already exists.


In [14]:
# Spark session with the PostgreSQL JDBC driver jar registered.
spark = (
    SparkSession.builder.appName("write-to-db")
    .config("spark.jars", "/usr/local/spark/jars/postgresql.jar")
    .getOrCreate()
)

# The JDBC target and credentials are identical for every table: build them once.
jdbc_url = f'jdbc:postgresql://{DB_HOST}:{DB_PORT}/{DB_NAME}'
properties = {
    'user': DB_USER,
    'password': DB_PASSWORD,
    'driver': 'org.postgresql.Driver'
}

# Raw DataFrames keyed by table name, kept for inspection in later cells.
_df_raw = {}
for table_name, _pk_columns, _foreign_keys in TABLE_PROPERTIES:
    filename = f'{INPUT_PATH}/{table_name}'
    # Parquet carries its own schema; no CSV-style "header" option is needed.
    _df_raw[table_name] = spark.read.format('parquet').load(filename)

    print(f'Writing {table_name} to PostgreSQL database...')
    # mode="overwrite" replaces any existing table. By default Spark drops and
    # recreates it, so the key constraints must be (re)applied after this cell.
    _df_raw[table_name].write.jdbc(url=jdbc_url, table=table_name, mode="overwrite", properties=properties)
print('Finished!')

Writing trip to PostgreSQL database...
Finished!


In [4]:
# use psycopg2 to add PRIMARY KEY and FOREIGN KEY constraints
def _constraint_exists(cursor, table, constraint):
    """Return True if a constraint named `constraint` is registered for `table`."""
    cursor.execute(
        "SELECT 1 FROM information_schema.table_constraints WHERE table_name=%s AND constraint_name=%s",
        (table, constraint),
    )
    return cursor.fetchone() is not None


conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT)
cur = conn.cursor()
try:
    # Primary keys first: a foreign key can only reference a column that
    # already has a primary-key/unique constraint.
    for table, pk_columns, _foreign_keys in TABLE_PROPERTIES:
        if not pk_columns:
            continue
        # Constraint names are emitted unquoted, so PostgreSQL stores them
        # lower-cased. Look them up in that form.
        constraint = f"{table}_pkey".lower()
        if not _constraint_exists(cur, table, constraint):
            pk_constraint_query = f'ALTER TABLE "{table}" ADD CONSTRAINT {constraint} PRIMARY KEY ({", ".join(pk_columns)})'
            print(pk_constraint_query)
            cur.execute(pk_constraint_query)

    for table, _pk_columns, foreign_keys in TABLE_PROPERTIES:
        for column, reference in foreign_keys:
            # Lower-case for the same reason as above. Without it a name like
            # trip_VendorID_fkey is never found on a re-run, and the duplicate
            # ADD CONSTRAINT fails.
            constraint = f"{table}_{column}_fkey".lower()
            if not _constraint_exists(cur, table, constraint):
                # Ensure column names are quoted for case sensitivity
                fk_constraint_query = f'ALTER TABLE "{table}" ADD CONSTRAINT {constraint} FOREIGN KEY ("{column}") REFERENCES {reference}'
                print(fk_constraint_query)
                cur.execute(fk_constraint_query)

    # All constraints are applied in a single transaction.
    conn.commit()
except Exception:
    # Leave the schema untouched if any statement fails, then surface the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

ALTER TABLE "trip" ADD CONSTRAINT trip_pkey PRIMARY KEY (id)
ALTER TABLE "taxi_zone" ADD CONSTRAINT taxi_zone_pkey PRIMARY KEY ("LocationID")
ALTER TABLE "vendor" ADD CONSTRAINT vendor_pkey PRIMARY KEY ("VendorID")
ALTER TABLE "rate_code" ADD CONSTRAINT rate_code_pkey PRIMARY KEY ("RatecodeID")
ALTER TABLE "payment_type" ADD CONSTRAINT payment_type_pkey PRIMARY KEY ("payment_type")
ALTER TABLE "trip" ADD CONSTRAINT trip_VendorID_fkey FOREIGN KEY ("VendorID") REFERENCES vendor("VendorID")
ALTER TABLE "trip" ADD CONSTRAINT trip_RatecodeID_fkey FOREIGN KEY ("RatecodeID") REFERENCES rate_code("RatecodeID")
ALTER TABLE "trip" ADD CONSTRAINT trip_PULocationID_fkey FOREIGN KEY ("PULocationID") REFERENCES taxi_zone("LocationID")
ALTER TABLE "trip" ADD CONSTRAINT trip_DOLocationID_fkey FOREIGN KEY ("DOLocationID") REFERENCES taxi_zone("LocationID")
ALTER TABLE "trip" ADD CONSTRAINT trip_payment_type_fkey FOREIGN KEY ("payment_type") REFERENCES payment_type(payment_type)
