In [25]:
from pathlib import Path

import pandas as pd
import yaml

# Locate the project configuration relative to the current working directory.
# NOTE(review): the relative path assumes the kernel runs from the notebook's
# own folder -- confirm if notebooks are ever launched from elsewhere.
CONFIG_PATH = Path("../config/config.yaml").resolve()
PROJECT_ROOT = CONFIG_PATH.parent.parent

# Read the YAML explicitly as UTF-8 so parsing does not depend on the
# platform's default text encoding.
with open(CONFIG_PATH, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Anchor every configured path at the project root.
PATHS = {
    k: PROJECT_ROOT / v
    for k, v in cfg["paths"].items()
}

RAW_DIR = PATHS["raw_data"]
INTERMEDIATE_DIR = PATHS["intermediate_data"]

# Fail early, with a pointer to the setup notebook, when the raw-data folder
# is missing.
assert RAW_DIR.exists(), "Run step-0-init-project.ipynb first"

# Combine and sort the CSV transaction data from the multiple files in each account folder into one aggregated CSV per account.

The cell below finds all account subfolders in the raw_data folder that contain CSV files.

In [26]:
# Collect every direct subfolder of RAW_DIR that holds at least one CSV file.
# Path.iterdir() yields entries in arbitrary order, so the result is sorted to
# keep the processing order (and the printed log) reproducible between runs.
account_dirs = sorted(
    p for p in RAW_DIR.iterdir()
    if p.is_dir() and any(p.glob("*.csv"))
)

# Stop here with a clear message rather than silently producing no output.
if not account_dirs:
    raise RuntimeError("No account subfolders with CSV files found in raw_data folder")

In [27]:
def amount_to_num(x):
    """Convert a raw amount value to a float.

    Accepts plain numbers as well as text such as ``"$1,234.56"`` or the
    accounting-style negative ``"(1,234.56)"``.

    Args:
        x: The raw value. ``int``/``float`` values are returned as ``float``
            unchanged (a float NaN stays NaN). Anything else is converted
            with ``str()`` and parsed as text.

    Returns:
        The amount as a ``float``; negative when the text was wrapped in
        parentheses. ``None`` and blank strings yield NaN.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    # already numeric → just return it
    if isinstance(x, (int, float)):
        return float(x)

    # Treat a missing value like the float NaN that passes through above.
    if x is None:
        return float("nan")

    # Coerce to text first so non-str values (e.g. Decimal) do not fail on a
    # missing .strip(); then drop currency symbols and thousands separators.
    s = str(x).replace("$", "").replace(",", "").strip()
    if not s:
        return float("nan")

    # Accounting notation: a value wrapped in parentheses is negative.
    neg = s.startswith("(") and s.endswith(")")
    val = float(s.strip("()"))

    return -val if neg else val

def load_and_concat_csvs(account_dir: Path) -> pd.DataFrame:
    """Load every CSV in an account folder into one date-sorted DataFrame.

    Files are read in sorted filename order and concatenated. The
    "Transaction Date" column is parsed to datetimes (unparseable values
    become NaT) and used as the index, and the "Amount" column is converted
    to floats with ``amount_to_num``.

    Args:
        account_dir: Folder containing the account's ``*.csv`` files.

    Returns:
        The combined rows, indexed and sorted by "Transaction Date".

    Raises:
        RuntimeError: If the folder contains no CSV files.
        KeyError: If the combined data lacks "Transaction Date" or "Amount".
    """
    csv_files = sorted(account_dir.glob("*.csv"))

    if not csv_files:
        raise RuntimeError(f"No CSV files found in {account_dir.name}")

    combined = pd.concat(
        [pd.read_csv(csv_path) for csv_path in csv_files],
        ignore_index=True,
    )

    # Name the account in the error instead of failing on a bare column lookup.
    missing = [
        col for col in ("Transaction Date", "Amount")
        if col not in combined.columns
    ]
    if missing:
        raise KeyError(
            f"{account_dir.name}: required column(s) missing from CSV data: {missing}"
        )

    combined["Transaction Date"] = pd.to_datetime(
        combined["Transaction Date"],
        errors="coerce",
    )

    # A stable sort keeps rows that share a date in their original
    # file/row order instead of shuffling them arbitrarily.
    combined = combined.set_index("Transaction Date").sort_index(kind="mergesort")
    combined["Amount"] = combined["Amount"].apply(amount_to_num)

    return combined


In [28]:
# Make sure the output folder exists before writing; to_csv does not create
# missing parent directories.
INTERMEDIATE_DIR.mkdir(parents=True, exist_ok=True)

# Write one aggregated CSV per account folder, named after the folder.
for account_dir in account_dirs:
    account_name = account_dir.name

    print(f"Processing account: {account_name}")
    df = load_and_concat_csvs(account_dir)

    output_path = INTERMEDIATE_DIR / f"{account_name}_aggregated.csv"
    # index=True keeps the "Transaction Date" index as the first CSV column.
    df.to_csv(output_path, index=True)

Processing account: gramma-checking-1448
Processing account: gramma-checking-7528
Processing account: gramma-checking-9272
Processing account: gramma-readynow-2128
Processing account: gramma-readynow-7528
Processing account: gramma-visacard-1639


  combined["Transaction Date"] = pd.to_datetime(
