In [1]:
import os
import io
from math import ceil
import pandas as pd
import requests
import dotenv
import mssql_python

# Pull settings from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# --- Maryland Open Data API ---
EXPORT_URL = "https://opendata.maryland.gov/api/v3/views/gy87-e27x/export.csv"
APP_TOKEN = os.environ.get("MD_APP_TOKEN")  # None when unset; the download cell then sends no token header

# --- Target database ---
SQL_CONNECTION_STRING = os.environ.get("SQL_CONNECTION_STRING")
if not SQL_CONNECTION_STRING:
    # Fail fast here instead of at connect time in a later cell.
    raise ValueError(
        "SQL_CONNECTION_STRING environment variable is required. Set it in your .env file or as an environment variable."
    )

TABLE_NAME = "[Maryland].[dbo].[OperatingBudget]"

# --- Column catalogue ---
# Dataset: Maryland Operating Budget - Funding Source (FY2017-2026)
# Source:  Department of Budget and Management (DBM) via opendata.maryland.gov
# Note:    Budget values are in actual dollars (not millions)
#
# One row per column: (CSV header, SQL column name, inferred SQL type, comment).
# COLUMN_DEFS and COLUMN_ALIASES below are both derived from this table, so the
# two mappings always list the same columns in the same order.
# NOTE(review): the generated definitions use a MySQL-style COMMENT '...' clause,
# which T-SQL column definitions do not accept - confirm how COLUMN_DEFS is
# consumed before feeding it into SQL Server DDL.
_COLUMN_SPECS = [
    ("Fiscal Year", "fiscal_year", "INT NOT NULL", "Fiscal year (2017-2026): FY2017-2024 actual expenditures, FY2025 working, FY2026 Governor allowance"),
    ("Agency Code", "agency_code", "NVARCHAR(25)", "Agency identifier code"),
    ("Agency Name", "agency_name", "NVARCHAR(255)", "Agency name"),
    ("Unit Code", "unit_code", "NVARCHAR(25)", "Unit/Division identifier code"),
    ("Unit Name", "unit_name", "NVARCHAR(255)", "Unit/Division name"),
    ("Program Code", "program_code", "NVARCHAR(25)", "Program identifier code"),
    ("Program Name", "program_name", "NVARCHAR(255)", "Program name"),
    ("Fund Type Name", "fund_type_name", "NVARCHAR(255)", "Type of fund"),
    ("Fund Source Code", "fund_source_code", "NVARCHAR(25)", "Funding source identifier code"),
    ("Fund Source Name", "fund_source_name", "NVARCHAR(255)", "Funding source name"),
    ("Budget", "budget", "DECIMAL(18,2) NOT NULL", "Budget amount in actual dollars (not millions)"),
    ("Organization Code", "organization_code", "NVARCHAR(25)", "Organization identifier code"),
    ("Description", "description", "NVARCHAR(MAX)", "Budget item description"),
    ("Category", "category", "INT", "Category identifier"),
    ("Category Title", "category_title", "NVARCHAR(255)", "Category title"),
]

# SQL column name -> "<type> COMMENT '<text>'" definition string.
COLUMN_DEFS = {
    column: f"{sql_type} COMMENT '{note}'"
    for _, column, sql_type, note in _COLUMN_SPECS
}

# CSV header as published by the portal -> SQL column name.
COLUMN_ALIASES = {
    csv_header: column
    for csv_header, column, _, _ in _COLUMN_SPECS
}

# Status banner so the cell output confirms the configuration was loaded.
for status_line in (
    "✓ All imports and configuration loaded",
    "✓ Dataset: Maryland Operating Budget - Funding Source (FY2017-2026)",
    "✓ Source: Department of Budget and Management (DBM)",
    "✓ Budget values are in actual dollars (single digit precision)",
):
    print(status_line)

✓ All imports and configuration loaded
✓ Dataset: Maryland Operating Budget - Funding Source (FY2017-2026)
✓ Source: Department of Budget and Management (DBM)
✓ Budget values are in actual dollars (single digit precision)


In [2]:
# Download and load dataset
# Fetches the full CSV export in one request; the app token header is only
# sent when MD_APP_TOKEN is configured.
headers = {"X-App-Token": APP_TOKEN} if APP_TOKEN else {}
response = requests.get(EXPORT_URL, params={"accessType": "DOWNLOAD"}, headers=headers, timeout=120)
response.raise_for_status()

# Identifier columns are read as text so the values reach the NVARCHAR target
# columns exactly as published. Left to dtype inference, pandas turns all-digit
# codes into numbers: leading zeros are dropped (Organization Code values such
# as P00_B01_01 suggest Program Code is zero-padded - TODO confirm against the
# raw CSV) and dotted codes such as 93.560 would lose their trailing zero.
CODE_COLUMNS = ["Agency Code", "Unit Code", "Program Code", "Fund Source Code", "Organization Code"]
df = pd.read_csv(io.BytesIO(response.content), dtype={col: str for col in CODE_COLUMNS})

print(f"✓ Dataset loaded: {len(df):,} rows × {len(df.columns)} columns\n")
print("Random sample of 10 records:")
# Cap the sample size so a short download is displayed instead of raising.
df.sample(min(10, len(df)))

✓ Dataset loaded: 23,740 rows × 15 columns

Random sample of 10 records:


Unnamed: 0,Fiscal Year,Agency Code,Agency Name,Unit Code,Unit Name,Program Code,Program Name,Fund Type Name,Fund Source Code,Fund Source Name,Budget,Organization Code,Description,Category,Category Title
4144,2018,P00,"Department of Labor, Licensing, and Regulation",B01,Division of Administration,1,Office of Administration,Federal Funds,84.002,Adult Education-Basic Grants to States,84258,P00_B01_01,The Office of Administration (OOA) program pro...,10,Other
13924,2022,S00,Department of Housing and Community Development,A27,Division of Finance and Administration,1,Finance and Administration,Special Funds,S00321,Special Loan Program Fund,284113,S00_A27_01,The program provides critical departmental sup...,5,Human Services
3555,2018,M00,Maryland Department of Health,I04,Deer's Head Center,1,Services and Institutional Operations,Special Funds,M00314,Renal Dialysis Collections,2115343,M00_I04_01,Deer’s Head Hospital Center (DHHC) provides: c...,1,Health
22700,2026,N00,Department of Human Services,A01,Office of the Secretary,1,Office of the Secretary,Federal Funds,93.658,Foster Care-Title IV-E,1080344,N00_A01_01,The Office of the Secretary provides overall d...,5,Human Services
20176,2025,M00,Maryland Department of Health,F03,Prevention and Health Promotion Administration,4,Family Health and Chronic Disease Services,Federal Funds,93.334,BOLD Public Health Strategies to Address Alzhe...,370383,M00_F03_04,The Family Health and Chronic Disease Services...,1,Health
22727,2026,N00,Department of Human Services,E01,Operations Office,1,"Division of Budget, Finance and Personnel",Federal Funds,93.558,Temporary Assistance for Needy Families,2962787,N00_E01_01,The Division supports the programs of other De...,5,Human Services
22584,2026,M00,Maryland Department of Health,L01,Behavioral Health Administration,2,Community Services,Federal Funds,93.665,Emergency Grants to Address Mental and Substan...,347930,M00_L01_02,This program provides funding for grants-based...,1,Health
14943,2023,K00,Department of Natural Resources,A01,Office of the Secretary,5,Information Technology Service,Special Funds,K00342,Waterway Improvement Fund,49010,K00_A01_05,The Information Technology Service provides sy...,7,Natural Resources and Environment
8406,2020,P00,Maryland Department of Labor,A01,Office of the Secretary,5,Legal Services,Special Funds,P00312,Workers' Compensation Commission,68486,P00_A01_05,The Legal Services program is the centralized ...,10,Other
19651,2025,J00,Department of Transportation,H01,Maryland Transit Administration,5,Facilities and Capital Equipment,Federal Funds,20.513,Enhanced Mobility of Seniors and Individuals w...,7114444,J00_H01_05,This program includes the following organizati...,4,Transportation


In [6]:
# Load all years to SQL
#
# Normalises the downloaded frame, then replaces the contents of TABLE_NAME.
# Everything that can fail without the database (renaming, type coercion,
# building the insert payload) runs BEFORE the table is touched, and the
# TRUNCATE plus all inserts share one transaction, so a failure part-way
# through rolls back instead of leaving the table empty or half loaded.
df_sql = df.rename(columns=COLUMN_ALIASES)
df_sql.columns = [col.strip().lower().replace(" ", "_").replace("/", "_").replace("-", "_") for col in df_sql.columns]

# Convert numeric columns with proper datatype handling
# Fiscal Year / Category: nullable Int64 so unparseable values become <NA>
for col in ["fiscal_year", "category"]:
    df_sql[col] = pd.to_numeric(df_sql[col], errors="coerce").astype("Int64")

# Budget: DECIMAL(18,2) - actual dollars, not millions
# Keep two decimal places for cents
# Remove commas from budget strings before converting to numeric
df_sql["budget"] = df_sql["budget"].astype(str).str.replace(",", "", regex=False)
df_sql["budget"] = pd.to_numeric(df_sql["budget"], errors="coerce").astype("float").round(2)

# Filter: budget IS NOT NULL AND budget > 0
df_sql = df_sql[(df_sql["budget"].notna()) & (df_sql["budget"] > 0)]

# Sort by fiscal year to process data in order
df_sql = df_sql.sort_values("fiscal_year").reset_index(drop=True)

# Prepare insert statement
ordered_cols = ["fiscal_year", "agency_code", "agency_name", "unit_code", "unit_name", 
                "program_code", "program_name", "fund_type_name", "fund_source_code", 
                "fund_source_name", "budget", "organization_code", "description", "category", "category_title"]
insert_sql = f"INSERT INTO {TABLE_NAME} ({', '.join(f'[{col}]' for col in ordered_cols)}) VALUES ({', '.join(['?'] * len(ordered_cols))})"

# Convert DataFrame to a list of row lists, mapping missing values to None (SQL NULL)
records = df_sql[ordered_cols].astype(object).where(pd.notnull(df_sql[ordered_cols]), None).values.tolist()

# Never wipe the table when there is nothing to put back (e.g. a bad download
# or a schema change that turned every budget into NaN).
if not records:
    raise ValueError(
        f"No rows with a positive budget to load; {TABLE_NAME} was left untouched."
    )

BATCH_SIZE = 1000
total_batches = ceil(len(records) / BATCH_SIZE)
total_inserted = 0

print("✓ Connecting to database...")
# NOTE(review): assumes the connection is not in autocommit mode (the original
# code called commit() explicitly) - confirm, otherwise rollback() is a no-op.
conn = mssql_python.connect(SQL_CONNECTION_STRING)
try:
    cursor = conn.cursor()
    try:
        # TRUNCATE runs inside the open transaction and is only made permanent
        # by the single commit after the last batch.
        cursor.execute(f"TRUNCATE TABLE {TABLE_NAME}")
        print("✓ Table truncated successfully - ready for batch insert")

        print(f"\n✓ Starting batch insert of {len(records):,} records in {total_batches} batches...\n")

        # Process batches - sent one at a time, committed together at the end
        for i in range(0, len(records), BATCH_SIZE):
            batch = records[i:i+BATCH_SIZE]
            cursor.executemany(insert_sql, batch)
            total_inserted += len(batch)
            batch_num = (i // BATCH_SIZE) + 1
            print(f"  ✓ Batch {batch_num}/{total_batches}: {len(batch)} records inserted - Running total: {total_inserted:,}")

        conn.commit()
    except Exception:
        # Undo the truncate and any inserted batches, then surface the error.
        conn.rollback()
        raise
    finally:
        cursor.close()
finally:
    conn.close()

print(f"\n✓ Successfully loaded {total_inserted:,} records to {TABLE_NAME}")
print(f"✓ Budget precision: 2 decimal places (actual dollars)")
print(f"✓ Data sorted by fiscal year: {df_sql['fiscal_year'].min():.0f} to {df_sql['fiscal_year'].max():.0f}")
print(f"✓ All {total_batches} batches committed successfully")

✓ Connecting to database...
✓ Table truncated successfully - ready for batch insert

✓ Starting batch insert of 23,698 records in 24 batches...

  ✓ Batch 1/24: 1000 records inserted - Running total: 1,000
  ✓ Batch 2/24: 1000 records inserted - Running total: 2,000
  ✓ Batch 3/24: 1000 records inserted - Running total: 3,000
  ✓ Batch 2/24: 1000 records inserted - Running total: 2,000
  ✓ Batch 3/24: 1000 records inserted - Running total: 3,000
  ✓ Batch 4/24: 1000 records inserted - Running total: 4,000
  ✓ Batch 4/24: 1000 records inserted - Running total: 4,000
  ✓ Batch 5/24: 1000 records inserted - Running total: 5,000
  ✓ Batch 5/24: 1000 records inserted - Running total: 5,000
  ✓ Batch 6/24: 1000 records inserted - Running total: 6,000
  ✓ Batch 7/24: 1000 records inserted - Running total: 7,000
  ✓ Batch 6/24: 1000 records inserted - Running total: 6,000
  ✓ Batch 7/24: 1000 records inserted - Running total: 7,000
  ✓ Batch 8/24: 1000 records inserted - Running total: 8,000
 