In [1]:
import os
from urllib.parse import quote

from delta import DeltaTable, configure_spark_with_delta_pip
from dotenv import load_dotenv
from pyspark.sql import SparkSession, types

In [2]:
# Load settings from a .env file into the process environment; presumably this
# is where the DB_* variables read from os.environ further down come from.
load_dotenv()

True

## Configure Spark

In [3]:
# Build a Spark session with Delta Lake enabled and the S3A connector pointed
# at the MinIO endpoint used for object storage.
builder = (
    SparkSession.builder.appName("DeltaTableMigration")
    # Resource sizing for the driver and executors.
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A settings: plain-HTTP endpoint with path-style bucket addressing.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:10001")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # The option must carry the `fs.s3a.` prefix like its siblings; a key
    # under any other prefix is silently ignored by the S3A connector.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)
# configure_spark_with_delta_pip wraps the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Load Database Tables into Global Temp Views

In [4]:
# Connection settings for the source database, taken from the environment.
# The first four variables are required (a missing one raises KeyError);
# the port falls back to "5432" when DB_PORT is not set.
db_user, db_password, db_host, db_name = (
    os.environ[var_name]
    for var_name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_NAME")
)
db_port = os.environ.get("DB_PORT", "5432")

In [5]:
# The credentials travel in the JDBC URL's query string, so percent-encode
# them: a raw "&", "=", "%", "?" or "'" in the user name or password would
# otherwise corrupt the URL, or the single-quoted SQL literal this URL is
# later embedded in. Values made only of unreserved characters are unchanged.
jdbc_url = (
    f"jdbc:postgresql://{db_host}:{db_port}/{db_name}"
    f"?user={quote(db_user, safe='')}&password={quote(db_password, safe='')}"
)

In [6]:
# Source tables to expose to Spark; one global temporary view is defined per
# table, named after the table itself.
source_tables = (
    "appropriation_account_balances",
    "submission_attributes",
    "treasury_appropriation_account",
    "cgac",
    "federal_account",
    "toptier_agency",
)


def _jdbc_view_sql(table_name):
    """Return the statement that defines a JDBC-backed view for `table_name`.

    The view reads the table of the same name through `jdbc_url` using the
    PostgreSQL driver.
    """
    return f"""
        CREATE OR REPLACE GLOBAL TEMPORARY VIEW {table_name}
        USING JDBC
        OPTIONS (
            driver 'org.postgresql.Driver',
            fetchsize '100000',
            url '{jdbc_url}',
            dbtable '{table_name}'
        )
    """


sql_strs = [_jdbc_view_sql(name) for name in source_tables]

In [7]:
# Run each CREATE OR REPLACE GLOBAL TEMPORARY VIEW statement, in list order,
# so the JDBC-backed views are registered with the Spark session.
for sql_str in sql_strs:
    spark.sql(sql_str)

In [8]:
# The view was created as a GLOBAL TEMPORARY VIEW, so it is addressed through
# the `global_temp` database.
aab = spark.table("global_temp.appropriation_account_balances")

In [9]:
# NOTE(review): toPandas() collects the entire table onto the driver (the
# recorded output shows roughly 490k rows) just to render a truncated preview;
# consider `aab.limit(n).toPandas()` to keep driver memory use bounded.
aab.toPandas()

Unnamed: 0,data_source,appropriation_account_balances_id,budget_authority_unobligated_balance_brought_forward_fyb,adjustments_to_unobligated_balance_brought_forward_cpe,budget_authority_appropriated_amount_cpe,borrowing_authority_amount_total_cpe,contract_authority_amount_total_cpe,spending_authority_from_offsetting_collections_amount_cpe,other_budgetary_resources_amount_cpe,total_budgetary_resources_amount_cpe,...,drv_other_obligated_amount,reporting_period_start,reporting_period_end,last_modified_date,certified_date,create_date,update_date,final_of_fy,submission_id,treasury_account_identifier
0,DBR,262923,807556.02,0.00,0.00,0.00,0.00,0.00,0.00,807556.02,...,,2018-01-01,2018-03-31,,,2021-03-09 14:51:58.436266,2021-03-09 14:51:58.436297,False,9663,57055
1,DBR,262924,870442.53,0.00,0.00,0.00,0.00,0.00,0.00,870442.53,...,,2018-01-01,2018-03-31,,,2021-03-09 14:51:58.436525,2021-03-09 14:51:58.436546,False,9663,60130
2,DBR,262925,821408.03,1260.00,0.00,0.00,0.00,0.00,0.00,822668.03,...,,2018-01-01,2018-03-31,,,2021-03-09 14:51:58.436763,2021-03-09 14:51:58.436782,False,9663,63002
3,DBR,262926,794352.55,0.00,0.00,0.00,0.00,0.00,0.00,794352.55,...,,2018-01-01,2018-03-31,,,2021-03-09 14:51:58.437018,2021-03-09 14:51:58.437040,False,9663,65521
4,DBR,262927,1388843.04,239382.54,0.00,0.00,0.00,0.00,0.00,1628225.58,...,,2018-01-01,2018-03-31,,,2021-03-09 14:51:58.437261,2021-03-09 14:51:58.437281,False,9663,69746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489796,DBR,766105,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,,2024-08-01,2024-08-31,,,2024-10-03 01:04:37.077141,2024-10-03 01:04:37.077147,True,80015,73387
489797,DBR,766106,1339563.20,0.00,0.00,0.00,0.00,0.00,0.00,1339563.20,...,,2024-08-01,2024-08-31,,,2024-10-03 01:04:37.077200,2024-10-03 01:04:37.077206,True,80015,70461
489798,DBR,766107,14547.14,0.00,0.00,0.00,0.00,0.00,0.00,14547.14,...,,2024-08-01,2024-08-31,,,2024-10-03 01:04:37.077259,2024-10-03 01:04:37.077265,True,80015,11489
489799,DBR,766108,0.00,33707.60,144457785222.91,0.00,0.00,0.00,0.00,144457818930.51,...,,2024-08-01,2024-08-31,,,2024-10-03 01:04:37.077318,2024-10-03 01:04:37.077324,True,80015,11487


## Load Table from MinIO

In [10]:
# Location of the award_search Delta table in object storage (s3a scheme).
award_search_path = "s3a://data/data/delta/rpt/award_search"
award_search = spark.read.format("delta").load(award_search_path)

In [11]:
# NOTE(review): toPandas() collects the entire Delta table onto the driver
# (the recorded output shows roughly 330k rows) just to render a truncated
# preview; consider `award_search.limit(n).toPandas()` instead.
award_search.toPandas()

Unnamed: 0,treasury_account_identifiers,award_id,data_source,transaction_unique_id,latest_transaction_id,earliest_transaction_id,latest_transaction_search_id,earliest_transaction_search_id,category,type_raw,...,officer_3_name,officer_4_amount,officer_4_name,officer_5_amount,officer_5_name,total_iija_outlay,total_iija_obligation,total_outlays,generated_pragmatic_obligation,program_activities
0,,24368000,DBR,9700_9700_HX36_0_SPM2DV11D9209_0,11662923,11662923,11662923,11662923,contract,C,...,,,,,,,,,,
1,,24581000,DBR,1434_-NONE-_ING04ERSA0502_0_-NONE-_0,40469183,40469183,40469183,40469183,contract,B,...,,,,,,,,,,
2,[65896],24509500,DBR,1448_-NONE-_INF16PX01429_2_-NONE-_0,124834180,2361400,124834180,2361400,contract,B,...,,,,,,,,,,
3,,24688000,DBR,1422_4730_INL11PD00087_1_GS35F0593V_0,31676342,51033863,31676342,51033863,contract,C,...,,,,,,,,,,
4,,25030500,DBR,9700_9700_J598_0_SPM2DV11D9200_0,2821594,2821594,2821594,2821594,contract,C,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329787,[11486],56917500,DBR,-NONE-_2800_-NONE-_2800201711300000013966,70010304,70010304,70010304,70010304,direct payment,10,...,,,,,,,,,,
329788,[11486],56911500,DBR,-NONE-_2800_-NONE-_2800201711300000007963,58871738,58871738,58871738,58871738,direct payment,10,...,,,,,,,,,,
329789,,58693500,DBR,-NONE-_3640_VAVBASURVPENSAUG2017_6410520170809388,71074793,71074793,71074793,71074793,direct payment,10,...,,,,,,,,,,
329790,,59583500,DBR,-NONE-_3640_VAVBAVOCREHAB FEB2017_64116201702...,73490902,73490902,73490902,73490902,direct payment,10,...,,,,,,,,,,
