In [None]:
from pathlib import Path
import yaml

# Resolve the project config relative to the notebook's working directory;
# the project root is two levels above the config file (<root>/config/config.yaml).
CONFIG_PATH = Path("../config/config.yaml").resolve()
PROJECT_ROOT = CONFIG_PATH.parent.parent

# Read the YAML with an explicit encoding so parsing does not depend on the
# platform's default locale encoding.
with CONFIG_PATH.open(encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Turn every entry of the config's `paths` mapping into a path under the project root.
PATHS = {
    k: PROJECT_ROOT / v
    for k, v in cfg["paths"].items()
}

RAW_DIR = PATHS["raw_data"]
INTERMEDIATE_DIR = PATHS["intermediate_data"]

# Fail fast with an actionable hint when the raw-data directory is missing.
assert RAW_DIR.exists(), "Run step-0-init-project.ipynb first"

## Create Merchant Spend Table Aggregated Across the Year (per-account, checking-only):

In [None]:
# Path of the statement CSV that the cells below load with pd.read_csv.
# NOTE(review): this looks like a placeholder — the string is handed to pd.read_csv
# unchanged, which presumably does not expand the `*` wildcard; replace it with the
# path of one concrete file (TODO confirm).
current_statement = "../path/to/*.aggregated-cleaned-internals-removed.csv"

In [None]:
import re
import numpy as np
import pandas as pd
from typing import Optional
from pathlib import Path

# Load the statement CSV named by `current_statement`; the merchant summary and
# monthly pivot cells below both work on (and modify) this frame.
df_cleaned = pd.read_csv(current_statement)

def amount_to_num(x):
    """
    Convert a statement amount to a float.

    Numeric inputs (Python or NumPy scalars) are returned as floats. Text
    inputs may contain a currency sign, thousands separators and whitespace;
    an amount wrapped in parentheses, e.g. "(1,234.50)", is treated as negative.

    Parameters
    ----------
    x : int, float, NumPy scalar, str or None
        The raw amount value.

    Returns
    -------
    float
        The parsed amount; NaN when `x` is None or blank text.

    Raises
    ------
    ValueError
        If the text that remains after clean-up is not a valid number.
    """
    # already numeric (including NumPy scalars) → just return it
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)

    # missing value → NaN so downstream aggregations skip it
    if x is None:
        return float('nan')

    # drop all whitespace, currency signs and thousands separators
    s = ''.join(str(x).split()).replace('$', '').replace(',', '')
    if not s:
        return float('nan')

    # accounting notation: a value wrapped in parentheses is negative
    neg = s.startswith('(') and s.endswith(')')
    val = float(s.strip('()'))

    return -val if neg else val


# Parse the transaction dates (unparseable values become NaT) and derive a
# numeric `amount` column from the raw `Amount` values.
df_cleaned['Transaction Date'] = pd.to_datetime(df_cleaned['Transaction Date'], errors='coerce')
df_cleaned['amount'] = df_cleaned['Amount'].apply(amount_to_num)

# Per-merchant statistics, expressed as pandas named aggregations.
merchant_stats = {
    'txn_count': ('amount', 'size'),
    'total_amount': ('amount', 'sum'),
    'avg_amount': ('amount', 'mean'),
    'std_amount': ('amount', 'std'),
    'first_txn': ('Transaction Date', 'min'),
    'last_txn': ('Transaction Date', 'max'),
}

# Group by merchant name (rows with a missing name form their own group), then
# order merchants by transaction count and, within ties, by total amount —
# both descending.
gb = df_cleaned.groupby('Merchant name', dropna=False)
merchant_summary = gb.agg(**merchant_stats).sort_values(
    by=['txn_count', 'total_amount'],
    ascending=[False, False],
)

# Optional view: merchants seen at least three times.
is_recurring = merchant_summary['txn_count'] >= 3
recurring_merchants = merchant_summary[is_recurring]

merchant_summary.head(500)  # show top merchants

## Create Merchant Spend Per Month Pivot Table (per-account, checking-only):

In [None]:
# Parse the transaction dates again so this cell also works when re-run on its
# own; values that cannot be parsed become NaT.
df_cleaned['Transaction Date'] = pd.to_datetime(df_cleaned['Transaction Date'], errors='coerce')

# Rows without a usable date cannot be assigned to a month. Copy the result so
# the column assignment below writes to an independent frame.
df_cleaned = df_cleaned.dropna(subset=['Transaction Date']).copy()

# Monthly period bucket for each transaction.
df_cleaned['Month'] = df_cleaned['Transaction Date'].dt.to_period('M')

# Total per merchant and month. Sum the parsed numeric `amount` column created
# by the merchant summary cell — the raw `Amount` column may hold text such as
# "(1,234.50)", which cannot be summed numerically.
monthly_summary = df_cleaned.groupby(['Merchant name', 'Month'], dropna=False).agg(
    monthly_total=('amount', 'sum')
).reset_index()

# One row per merchant, one column per month; months without activity show 0.
monthly_pivot = monthly_summary.pivot(
    index='Merchant name',
    columns='Month',
    values='monthly_total'
).fillna(0)

# Sort by total spend, ascending because spend is negative (largest spend first).
monthly_pivot['Total Spend'] = monthly_pivot.sum(axis=1)
monthly_pivot = monthly_pivot.sort_values('Total Spend')

monthly_pivot.head(500)

## Ensure the output folder exists for the sunburst charts:

In [None]:
def ensure_path_exists(path: str, force_file: Optional[bool] = None) -> Path:
    """
    Ensure `path` exists on disk and return it as a Path.

    Parameters
    ----------
    path : str
        File or directory location to create if it is missing.
    force_file : bool, optional
        True  -> create an empty file (parent directories are created first).
        False -> create a directory (including parents).
        None  -> if `path` already exists it is left untouched; otherwise the
                 kind is inferred: a path with a suffix (Path.suffix != '') is
                 treated as a file, anything else as a directory.

    Returns
    -------
    Path
        The Path object for `path`.

    Raises
    ------
    FileExistsError
        If force_file is False but `path` already exists as a regular file.
    """
    p = Path(path)

    if force_file is None:
        # An existing path needs no inference: guessing from the suffix would
        # mistake an extension-less file (e.g. "Makefile") for a directory and
        # make mkdir() fail.
        if p.exists():
            return p
        is_file = bool(p.suffix)  # treat paths with an extension as files
    else:
        is_file = bool(force_file)

    if is_file:
        # create parent dirs first, then an empty file (touch)
        p.parent.mkdir(parents=True, exist_ok=True)
        p.touch(exist_ok=True)
    else:
        # create directory (including parents)
        p.mkdir(parents=True, exist_ok=True)

    return p

# Create ../outputs/sunburst/ (and any missing parents) up front; force_file=False
# makes the helper treat the path as a directory. The sunburst cell below writes
# its HTML export into this folder.
ensure_path_exists("../outputs/sunburst/", force_file=False)

## Build Top Spending Merchants Across Year Sunburst Chart (per-account, checking-only):

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from pathlib import Path

# Open rendered figures in a browser tab rather than inline in the notebook.
pio.renderers.default = "browser"

# Load the same CSV used elsewhere in the notebook, reading every column as
# text so the amount strings can be parsed explicitly below. header=0 discards
# the file's own header row and `names` relabels the columns positionally.
# NOTE(review): assumes the CSV has exactly these ten columns in this order — confirm.
df = pd.read_csv(
    current_statement,
    header=0,
    names=[
        'Transaction Date', 'Posted Date', 'Transaction Type', 'Check/Serial #', 'Full description',
        'Merchant name', 'Category name', 'Sub-category name', 'Amount', 'Daily Posted Balance'
    ],
    dtype=str
)

# Preserve the raw amount text. Missing values are filled *before* the string
# cast: casting first would turn NaN into the literal text 'nan' and make the
# fill a no-op.
df['Amount_raw'] = df['Amount'].fillna('').astype(str)

# Parse numeric amounts, treating a value wrapped in parentheses as negative.
paren_mask = df['Amount_raw'].str.match(r'^\s*\(.*\)\s*$', na=False)
cleaned = df['Amount_raw'].str.replace(r'[\$\s,]', '', regex=True).str.replace(r'[()]', '', regex=True)
df['Amount_num'] = pd.to_numeric(cleaned, errors='coerce')
df.loc[paren_mask, 'Amount_num'] = -df.loc[paren_mask, 'Amount_num']

# Drop rows whose amount could not be parsed.
df = df.dropna(subset=['Amount_num']).copy()

# Fill missing merchant names from the first 40 characters of the description;
# use an explicit placeholder when the description is missing as well, instead
# of letting the text 'nan' appear as a chart label.
description_fallback = df['Full description'].fillna('Unknown merchant').astype(str).str[:40]
df['Merchant name'] = df['Merchant name'].fillna(description_fallback)

# px.sunburst rejects rows where a parent level (the category) is missing while
# the child level (the merchant) is present, so give such rows a named bucket.
df['Category name'] = df['Category name'].fillna('Uncategorized')

# Build mask: only negative amounts, excluding the category 'income'.
cat_lower = df['Category name'].astype(str).str.lower()
chart_mask = (df['Amount_num'] < 0) & (cat_lower != 'income')

# Explicit copy so the new column is written to an independent frame; slice
# sizes must be positive, hence the absolute value.
spend_df = df.loc[chart_mask].copy()
spend_df.loc[:, 'plot_amount'] = spend_df['Amount_num'].abs()


# Build the sunburst from the filtered data (expenses only):
# inner ring = category, outer ring = merchant, slice size = amount spent.
fig = px.sunburst(
    spend_df,
    path=['Category name', 'Merchant name'],
    values='plot_amount',
    title="2026 Spending Breakdown: Categories and Merchants (expenses only)",
    color='Category name',
    template='plotly_dark',
    color_discrete_sequence=px.colors.sequential.Turbo
)
fig.update_layout(
    margin=dict(t=100, l=100, r=100, b=100),
    font=dict(size=24),
)
fig.update_traces(
    insidetextorientation='radial',
    texttemplate='%{label}<br>%{value}',# show label + value inside slices
    insidetextfont=dict(size=18)
)

fig.show()
# The target folder is created by the ensure_path_exists call earlier in the notebook.
fig.write_html("../outputs/sunburst/checking-merchant-spend.html") # replace the path as desired