# Работа с JDBC базами данных
PostgreSQL, Oracle, Vertica

In [None]:
import os
from spark_config import get_spark_session, get_jdbc_url

# Session object (app name "JDBCDemo") used by every cell below and
# stopped in the final cell.
spark = get_spark_session(app_name="JDBCDemo")

## PostgreSQL

In [None]:
# PostgreSQL: build the JDBC URL and the connection properties.
# Credentials are read from PG_USER / PG_PASSWORD and fall back to 'spark'.
pg_url = get_jdbc_url(
    'postgres',
    host='postgres',
    port=5432,
    database='spark_db',
)
pg_properties = {
    "user": os.getenv('PG_USER', 'spark'),
    "password": os.getenv('PG_PASSWORD', 'spark'),
    "driver": "org.postgresql.Driver",
}

print(f"PostgreSQL URL: {pg_url}")

In [None]:
# Read from PostgreSQL
# df_pg = spark.read.jdbc(url=pg_url, table="my_table", properties=pg_properties)
# df_pg.show()

In [None]:
# Write to PostgreSQL
# df.write.jdbc(url=pg_url, table="new_table", mode="overwrite", properties=pg_properties)

## Oracle

In [None]:
# Oracle: build the JDBC URL (database and service are both 'ORCL') and the
# connection properties. Credentials are read from ORACLE_USER /
# ORACLE_PASSWORD and fall back to 'system' / 'oracle'.
oracle_url = get_jdbc_url(
    'oracle',
    host='oracle-db',
    port=1521,
    database='ORCL',
    service='ORCL',
)
oracle_properties = {
    "user": os.getenv('ORACLE_USER', 'system'),
    "password": os.getenv('ORACLE_PASSWORD', 'oracle'),
    "driver": "oracle.jdbc.OracleDriver",
}

print(f"Oracle URL: {oracle_url}")

In [None]:
# Read from Oracle, pushing the filter down to the database via a subquery
# df_oracle = spark.read.jdbc(
#     url=oracle_url,
#     table="(SELECT * FROM orders WHERE order_date > DATE '2024-01-01') subq",
#     properties=oracle_properties
# )

## Vertica

In [None]:
# Vertica: build the JDBC URL and the connection properties.
# Credentials are read from VERTICA_USER / VERTICA_PASSWORD and fall back to
# 'dbadmin' with an empty password.
vertica_url = get_jdbc_url(
    'vertica',
    host='vertica-host',
    port=5433,
    database='analytics',
)
vertica_properties = {
    "user": os.getenv('VERTICA_USER', 'dbadmin'),
    "password": os.getenv('VERTICA_PASSWORD', ''),
    "driver": "com.vertica.jdbc.Driver",
}

print(f"Vertica URL: {vertica_url}")

In [None]:
# Read from Vertica with partitioning for parallel reads
# df_vertica = spark.read.jdbc(
#     url=vertica_url,
#     table="fact_sales",
#     column="id",
#     lowerBound=0,
#     upperBound=1000000,
#     numPartitions=10,
#     properties=vertica_properties
# )

## Универсальная функция чтения

In [None]:
def read_from_db(spark, db_type, table, partition_column=None, num_partitions=10, **conn_params):
    """
    Read a table (or subquery) from a JDBC database via ``spark.read.jdbc``.

    When ``partition_column`` is given, an extra query first fetches the
    column's MIN and MAX; these become ``lowerBound``/``upperBound`` of a
    partitioned (parallel) read. If either bound comes back as NULL (empty
    table, or the column holds only NULLs) there is no range to split, so
    the function falls back to a plain, non-partitioned read.

    Args:
        spark: SparkSession
        db_type: 'postgres', 'oracle', or 'vertica'
        table: table name or aliased subquery, e.g. "(SELECT ...) subq";
            it is interpolated into the FROM clause of the bounds query
        partition_column: column for parallel reads; None or an empty
            value means a single, non-partitioned read
        num_partitions: number of partitions for the parallel read
        **conn_params: host, port, database, user, password (all required)

    Returns:
        The result of ``spark.read.jdbc`` (a DataFrame).

    Raises:
        KeyError: if a required connection parameter is missing or
            ``db_type`` has no entry in the driver table.

    NOTE(review): extra connection options such as ``service`` (used for the
    Oracle URL elsewhere in this file) are not forwarded to get_jdbc_url.
    """
    drivers = {
        'postgres': 'org.postgresql.Driver',
        'oracle': 'oracle.jdbc.OracleDriver',
        'vertica': 'com.vertica.jdbc.Driver'
    }

    # Keyword arguments, matching every other get_jdbc_url call in this file.
    url = get_jdbc_url(
        db_type,
        host=conn_params['host'],
        port=conn_params['port'],
        database=conn_params['database'],
    )
    properties = {
        'user': conn_params['user'],
        'password': conn_params['password'],
        'driver': drivers[db_type]
    }

    if not partition_column:
        return spark.read.jdbc(url=url, table=table, properties=properties)

    # Get bounds for partitioning
    bounds_query = (
        f"(SELECT MIN({partition_column}) as min_val, "
        f"MAX({partition_column}) as max_val FROM {table}) bounds"
    )
    bounds_row = spark.read.jdbc(
        url=url,
        table=bounds_query,
        properties=properties
    ).collect()[0]

    # Read by position, not by alias: Oracle folds unquoted aliases to upper
    # case (MIN_VAL / MAX_VAL) and Row lookup by name is case-sensitive.
    lower_bound, upper_bound = bounds_row[0], bounds_row[1]

    if lower_bound is None or upper_bound is None:
        # MIN/MAX are NULL: empty table or all-NULL column. The partitioned
        # reader rejects None bounds, so read without partitioning.
        return spark.read.jdbc(url=url, table=table, properties=properties)

    return spark.read.jdbc(
        url=url,
        table=table,
        column=partition_column,
        lowerBound=lower_bound,
        upperBound=upper_bound,
        numPartitions=num_partitions,
        properties=properties
    )

In [None]:
# Stop the session created in the first cell now that the demo is finished.
spark.stop()