# Get General Ledger Data
Source: https://data.ok.gov/dataset/general-ledger

## Imports

In [None]:
from __future__ import annotations
from typing import List, Tuple
import os
from pathlib import Path
import subprocess
import requests
from rich import print
import pandas as pd
import duckdb

# file encoding (detected mixed encoding in the hosted files)
from charset_normalizer import detect

## Constants

In [None]:
# Oklahoma Government dataset
DOMAIN = "https://data.ok.gov"
ACTION = "/api/3/action/package_show"
URL = f"{DOMAIN}/{ACTION}"
PARAMS = {
    "id": "general-ledger"
}

# Local storage
DATA_DIR = Path("../data/gl")
DATA_RAW = DATA_DIR / "01_raw"
DATA_SILVER = DATA_DIR / "02_silver"
DATA_GOLD = DATA_DIR / "03_gold"
LOCAL_DB = Path("../data/local_db")
LOCAL_DATA = [DATA_DIR, DATA_RAW, DATA_SILVER, DATA_GOLD, LOCAL_DB]

for p in LOCAL_DATA:
    p.mkdir(parents=True, exist_ok=True)

# Local DB connection
con = duckdb.connect(f"{str(LOCAL_DB)}/ledger.duckdb")
con

## Available Resources
- Datasets available from FY22 to FY26 Q3
  - FY26 most likely to be incomplete
- Extract the filename and download urls

In [4]:
# Retrieve filenames and download urls
response = requests.get(url=URL, params=PARAMS)
response.raise_for_status()

data = response.json()\
    .get('result', [])\
    .get('resources', [])

# filename and download url
url_collections = [
    (f['name'], f['url']) for f in data
    if f['url'].lower().endswith('.csv')
]

print(url_collections)

## Download Locally
- Store in `raw` and write to duckdb
- Use `curl` to download the files in bulk
- Files have varying encoding types (`ascii` and `windows-1250`)
  - Handle in cleaning phase

In [5]:
def download_files(url_collections: List[Tuple[str, str]]):
    # download the data and provide the filename
    for fname, url in url_collections:
        fname = str((DATA_RAW) / fname)
        subprocess.run(
            args=['curl', '-L', '-o', fname, url],
            check=True
        )
        yield fname


# Raw local db data
table_created = False

# Download files / append to duckdb
for file in download_files(url_collections):
    try:
        # Files appear to be ascii encoded
        df = pd.read_csv(file, encoding="ascii")
    except UnicodeDecodeError:
        try:
            print(f"Retrying {file=} with a different encoding")
            with open(file, 'rb') as f:
                encoding = detect(f.read())['encoding']
                df = pd.read_csv(file, encoding=encoding)
        except Exception as e:
            print(f"Failed to process {file}: {e}")
            continue
    
    # Create the initial table
    if not table_created:
        con.execute("CREATE TABLE ledger_raw AS SELECT * FROM df LIMIT 0")
        table_created = True

    # Append to the table
    try:
        con.execute("INSERT INTO ledger_raw SELECT * FROM df")
    except Exception as e:
        print(f"Failed to append {file} to DuckDB: {e}")
    
    print(f"Saved and appended {file=}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4166      0 --:--:-- --:--:-- --:--:--  4173
100  121M  100  121M    0     0  13.2M      0  0:00:09  0:00:09 --:--:-- 14.0M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4881      0 --:--:-- --:--:-- --:--:--  4887
100 98.9M  100 98.9M    0     0  13.1M      0  0:00:07  0:00:07 --:--:-- 15.2M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4246      0 --:--:-- --:--:-- --:--:--  4254
100  160M  100  160M    0     0  15.8M      0  0:00:10  0:00:10 --:--:-- 17.5M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4292      0 --:--:-- --:--:-- --:--:--  4288
100 15.9M  100 15.9M    0     0  11.5M      0  0:00:01  0:00:01 --:--:-- 16.5M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4667      0 --:--:-- --:--:-- --:--:--  4676
100  160M  100  160M    0     0  14.0M      0  0:00:11  0:00:11 --:--:-- 16.0M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4367      0 --:--:-- --:--:-- --:--:--  4375
100  185M  100  185M    0     0  14.9M      0  0:00:12  0:00:12 --:--:-- 16.6M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   5686      0 --:--:-- --:--:-- --:--:--  5680
100  156M  100  156M    0     0  14.9M      0  0:00:10  0:00:10 --:--:-- 16.4M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   1354      0 --:--:-- --:--:-- --:--:--  1354
100  164M  100  164M    0     0  12.3M      0  0:00:13  0:00:13 --:--:-- 13.0M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4927      0 --:--:-- --:--:-- --:--:--  4909
100  128M  100  128M    0     0  14.2M      0  0:00:08  0:00:08 --:--:-- 15.2M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   5254      0 --:--:-- --:--:-- --:--:--  5241
100  108M  100  108M    0     0  13.1M      0  0:00:08  0:00:08 --:--:-- 12.9M


  df = pd.read_csv(file, encoding=encoding)


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4376      0 --:--:-- --:--:-- --:--:--  4392
100  152M  100  152M    0     0  6225k      0  0:00:25  0:00:25 --:--:-- 6683k
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   5201      0 --:--:-- --:--:-- --:--:--  5191
100  214M  100  214M    0     0  15.5M      0  0:00:13  0:00:13 --:--:-- 16.2M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   3984      0 --:--:-- --:--:-- --:--:--  3974
100  210M  100  210M    0     0  14.8M      0  0:00:14  0:00:14 --:--:-- 16.9M


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4607      0 --:--:-- --:--:-- --:--:--  4617
100  204M  100  204M    0     0  13.7M      0  0:00:14  0:00:14 --:--:-- 15.3M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   5481      0 --:--:-- --:--:-- --:--:--  5479
100  154M  100  154M    0     0  14.6M      0  0:00:10  0:00:10 --:--:-- 14.4M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4366      0 --:--:-- --:--:-- --:--:--  4375
100  141M  100  141M    0     0  13.3M      0  0:00:10  0:00:10 --:--:-- 14.5M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   4493      0 --:--:-- --:--:-- --:--:--  4483
100  186M  100  186M    0     0  14.3M      0  0:00:12  0:00:12 --:--:-- 16.5M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   5093      0 --:--:-- --:--:-- --:--:--  5117
100  140M  100  140M    0     0  11.3M      0  0:00:12  0:00:12 --:--:-- 12.8M
  df = pd.read_csv(file, encoding="ascii")


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1085    0  1085    0     0   5117      0 --:--:-- --:--:-- --:--:--  5142
100 73.3M  100 73.3M    0     0  13.4M      0  0:00:05  0:00:05 --:--:-- 14.6M
  df = pd.read_csv(file, encoding="ascii")
