In [1]:
from pathlib import Path
import yaml

# Locate the project config relative to the notebook's working directory.
# PROJECT_ROOT is the directory two levels above the config file.
CONFIG_PATH = Path("../config/config.yaml").resolve()
PROJECT_ROOT = CONFIG_PATH.parent.parent

with open(CONFIG_PATH, encoding="utf-8") as f:
    # yaml.safe_load returns None for an empty document; use an empty mapping instead.
    cfg = yaml.safe_load(f) or {}

# Each entry under "paths" is joined onto the project root.
PATHS = {
    k: PROJECT_ROOT / v
    for k, v in (cfg.get("paths") or {}).items()
}

# Fail with an actionable message instead of a bare KeyError: 'raw'.
_missing_path_keys = [k for k in ("raw", "intermediate") if k not in PATHS]
if _missing_path_keys:
    raise KeyError(
        f"{CONFIG_PATH} must define paths for {_missing_path_keys}; "
        f"keys found under 'paths': {list(PATHS)}"
    )

RAW_DIR = PATHS["raw"]
INTERMEDIATE_DIR = PATHS["intermediate"]

assert RAW_DIR.exists(), "Run step-0-init-project.ipynb first"


KeyError: 'raw'

# Combine the CSV transaction data from all files in a folder and sort it into one aggregated CSV

In [None]:
import pathlib

import pandas as pd

def find_csv_files(folder: pathlib.Path):
    """Return the entries directly inside ``folder`` that match ``*.csv``.

    The search is not recursive. The result is a list of paths in ascending
    sort order; it is empty when nothing matches.
    """
    csv_paths = list(folder.glob("*.csv"))
    csv_paths.sort()
    return csv_paths

def list_account_dirs(root: pathlib.Path):
    """List the account folders found under ``root``.

    Returns a list of ``(account_name, path)`` tuples, one for each immediate
    subdirectory of ``root`` that directly contains at least one ``*.csv``
    entry, in sorted path order. If no subdirectory qualifies but ``root``
    itself holds CSV files, ``root`` is returned as the single account, named
    after the folder (or ``"account"`` when the folder name is empty).
    """

    def holds_csv(directory):
        # True as soon as one matching entry is found.
        return next(directory.glob("*.csv"), None) is not None

    found = [
        (child.name, child)
        for child in sorted(root.iterdir())
        if child.is_dir() and holds_csv(child)
    ]
    if found:
        return found

    # No per-account subfolders: treat the root itself as the account.
    if holds_csv(root):
        found.append((root.name or "account", root))
    return found

def read_csv_robust(path: pathlib.Path):
    """Read a CSV file, trying ``,``, ``;`` and tab as the delimiter.

    pandas does not raise when the delimiter is wrong; it returns a frame with
    a single column holding each whole line. The delimiters are therefore
    tried in order and the first parse that yields more than one column is
    returned. If none does, the first parse that succeeded is returned (so a
    genuine single-column file still loads).

    Parameters:
        path: location of the CSV file.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Whatever ``pandas.read_csv`` raises when the file cannot be read at
        all (for example ``FileNotFoundError`` or
        ``pandas.errors.EmptyDataError``). Parser and decoding errors are
        retried with the next delimiter and, as a last resort, with the Python
        engine ignoring undecodable bytes.
    """
    first_success = None
    for sep in (",", ";", "\t"):
        try:
            candidate = pd.read_csv(path, sep=sep)
        except (pd.errors.ParserError, UnicodeDecodeError):
            continue
        if candidate.shape[1] > 1:
            return candidate
        if first_success is None:
            first_success = candidate

    if first_success is not None:
        return first_success

    # Last resort: the more tolerant Python engine, dropping undecodable bytes.
    return pd.read_csv(path, engine="python", encoding_errors="ignore")

# Write out aggregated and sorted CSV for one account folder:

In [None]:
from IPython.display import HTML, display

# Folder holding the CSV files for one account (edit before running).
current_statement = "../path/to/csv/folder"
folder = pathlib.Path(current_statement)
files = find_csv_files(folder)

# Stop with a clear message: pd.concat raises an opaque ValueError on an empty list.
if not files:
    raise FileNotFoundError(f"No CSV files for account in {folder}")

dfs = [read_csv_robust(f) for f in files]
df = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(df)} rows from {len(files)} files in {folder}")

# Preview only: rendering every row bloats the notebook and exposes all the data in its saved output.
display(df.head())

assert "Transaction Date" in df.columns, (
    f"Expected a 'Transaction Date' column, found: {list(df.columns)}"
)

# errors="coerce" turns unparseable values into NaT; report them instead of hiding them.
transaction_dates = pd.to_datetime(df["Transaction Date"], errors="coerce")
n_undated = int(transaction_dates.isna().sum())
if n_undated:
    print(f"Warning: {n_undated} rows have a missing or unparseable Transaction Date")

# mergesort is stable, so rows sharing a date keep their original file order.
df_agg = (
    df.assign(**{"Transaction Date": transaction_dates})
    .set_index("Transaction Date")
    .sort_index(kind="mergesort")
)

# Write next to the folder, not inside it, so a re-run does not read the
# aggregate back in as input (also when current_statement ends with a slash).
aggregated_path = folder.parent / f"{folder.name}-aggregated.csv"
df_agg.to_csv(aggregated_path, index=True, index_label="Transaction Date")
