In [54]:
from pathlib import Path
import yaml

CONFIG_PATH = Path("../config/config.yaml").resolve()
PROJECT_ROOT = CONFIG_PATH.parent.parent

with open(CONFIG_PATH) as f:
    cfg = yaml.safe_load(f)

PATHS = {
    k: PROJECT_ROOT / v
    for k, v in cfg["paths"].items()
}

RAW_DIR = PATHS["raw_data"]
INTERMEDIATE_DIR = PATHS["intermediate_data"]

assert RAW_DIR.exists(), "Run step-0-init-project.ipynb first"

In [55]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd().parents[0]  # adjust if notebook depth differs
RAW_DIR = PROJECT_ROOT / "data" / "intermediate"
OUT_DIR = PROJECT_ROOT / "data" / "intermediate" / "reconciled"
print(f"Raw data dir: {RAW_DIR}")
print(f"Output dir: {OUT_DIR}")
print(f"Project root: {PROJECT_ROOT}")

OUT_DIR.mkdir(parents=True, exist_ok=True)

checking_files = sorted([
    f for f in RAW_DIR.glob("*.csv")
    if "checking" in f.name.lower()
])

assert checking_files, "No checking CSV files found"


Raw data dir: /home/smoke/PycharmProjects/gramma-banking/data/intermediate
Output dir: /home/smoke/PycharmProjects/gramma-banking/data/intermediate/reconciled
Project root: /home/smoke/PycharmProjects/gramma-banking


In [56]:
dfs = {}

for f in checking_files:
    df = pd.read_csv(f)

    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])

    # REQUIRED metadata
    df["_source_file"] = f.name
    df["_account_id"] = f.stem
    df["_row_id"] = range(len(df))   # <-- THIS is what youâ€™re missing

    dfs[f.name] = df


In [57]:
combined = pd.concat(dfs.values(), ignore_index=True)

# Safety: never touch amounts
assert (combined["Amount"] != 0).all(), "Zeroed amounts detected (should not happen)"


In [58]:
combined["abs_amount"] = combined["Amount"].abs()

pairs = combined.merge(
    combined,
    on="abs_amount",
    suffixes=("_a", "_b")
)

pairs = pairs[
    (pairs["Amount_a"] == -pairs["Amount_b"]) &
    (pairs["_source_file_a"] != pairs["_source_file_b"]) &
    (abs((pairs["Transaction Date_a"] - pairs["Transaction Date_b"]).dt.days) <= 1)
]
print(combined.columns.tolist())



['Transaction Date', 'Posted Date', 'Transaction Type', 'Check/Serial #', 'Full description', 'Merchant name', 'Category name', 'Sub-category name', 'Amount', 'Daily Posted Balance', '_source_file', '_account_id', '_row_id', 'abs_amount']


In [59]:
pairs = pairs[
    (pairs["_source_file_a"] < pairs["_source_file_b"]) |
    (
        (pairs["_source_file_a"] == pairs["_source_file_b"]) &
        (pairs["_row_id_a"] < pairs["_row_id_b"])
    )
]


In [60]:
rows_to_drop = {name: set() for name in dfs}

for _, r in pairs.iterrows():
    rows_to_drop[r["_source_file_a"]].add(r["_row_id_a"])
    rows_to_drop[r["_source_file_b"]].add(r["_row_id_b"])


In [61]:
reconciled = {}

for name, df in dfs.items():
    original_len = len(df)

    drop_ids = rows_to_drop.get(name, set())
    df_new = df[~df["_row_id"].isin(drop_ids)].copy()

    # SANITY CHECKS
    assert len(df_new) <= original_len
    assert (df_new["Amount"] != 0).all()
    assert "Transaction Date" in df_new.columns

    reconciled[name] = df_new


In [62]:
for fname, df in reconciled.items():
    out_name = fname.replace(".csv", "-internals-removed.csv")
    out_path = OUT_DIR / out_name

    df.drop(columns=["_source_file", "_account_id", "_row_id"], errors="ignore") \
      .to_csv(out_path, index=False)


In [66]:
summary = []

for name in dfs:
    summary.append({
        "file": name,
        "original_rows": dfs[name],
        "final_rows": reconciled[name],
        "removed": len(dfs[name]) - len(reconciled[name])
    })

def diff_summary():
    old_df = pd.read_csv("/home/smoke/Documents/MOVE THESE BACK/gramma-checking-7528-aggregated-cleaned-internals-removed.csv")
    new_df = pd.read_csv("/home/smoke/PycharmProjects/gramma-banking/data/intermediate/reconciled/gramma-checking-7528_aggregated-internals-removed.csv")

    print(new_df.columns)

    cols = ["Transaction Date", "Amount", "Full description"]

    removed = (
        old_df
        .merge(new_df[cols], on=cols, how="left", indicator=True)
        .query('_merge == "left_only"')
    )

    added = (
        new_df
        .merge(old_df[cols], on=cols, how="left", indicator=True)
        .query('_merge == "left_only"')
    )


    return {
        "old_rows": len(old_df),
        "new_rows": len(new_df),
        "removed": len(old_df) - len(new_df),
        "old_sum": old_df["Amount"].sum(),
        "new_sum": new_df["Amount"].sum(),
        "sum_delta": old_df["Amount"].sum() - new_df["Amount"].sum()
    }

summarized = diff_summary()
print(summarized)

Index(['Transaction Date', 'Posted Date', 'Transaction Type', 'Check/Serial #',
       'Full description', 'Merchant name', 'Category name',
       'Sub-category name', 'Amount', 'Daily Posted Balance'],
      dtype='object')
OLD COLUMNS:
['Transaction Date', 'Posted Date', 'Transaction Type', 'Check/Serial #', 'Full description', 'Merchant name', 'Category name', 'Sub-category name', 'Amount', 'Daily Posted Balance']

NEW COLUMNS:
['Transaction Date', 'Posted Date', 'Transaction Type', 'Check/Serial #', 'Full description', 'Merchant name', 'Category name', 'Sub-category name', 'Amount', 'Daily Posted Balance']
{'old_rows': 1434, 'new_rows': 1447, 'removed': -13, 'old_sum': np.float64(-69253.31999999999), 'new_sum': np.float64(-63553.32), 'sum_delta': np.float64(-5699.999999999993)}
