In [None]:
from pathlib import Path
import yaml

# Locate the YAML config relative to the current working directory and make it absolute.
CONFIG_PATH = Path("../config/config.yaml").resolve()
# The project root is two levels above the config file (the parent of its directory).
PROJECT_ROOT = CONFIG_PATH.parents[1]

with CONFIG_PATH.open() as f:
    cfg = yaml.safe_load(f)

# Anchor every entry of the config's "paths" section at the project root.
PATHS = {
    name: PROJECT_ROOT / relative
    for name, relative in cfg["paths"].items()
}

RAW_DIR, INTERMEDIATE_DIR = PATHS["raw_data"], PATHS["intermediate_data"]

# Fail fast when the raw-data directory is missing.
assert RAW_DIR.exists(), "Run step-0-init-project.ipynb first"

# Read in the CSV statement and normalize the "Amount" column for credit card accounts: parenthesized values are converted to positive numbers, and all other values are made negative.

In [40]:
import re
import pandas as pd
import numpy as np

# Statement CSV to load; the path is relative to the current working directory.
current_statement = "../path/to/credit-account-aggregated.csv"

# Read the file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=current_statement)

def parse_amount(x):
    """Convert one raw "Amount" cell into a signed float.

    Sign convention applied here:
      * a parenthesized value such as "(12.34)" becomes positive (12.34);
      * every other value, numeric or text and whatever its original sign,
        becomes negative ("12.34", "-12.34" and 12.34 all give -12.34).

    "$" signs and thousands separators (",") are ignored in text values.

    Parameters
    ----------
    x : number, str or missing value
        The raw cell content.

    Returns
    -------
    float
        The signed amount, or NaN when the cell is missing or blank.

    Raises
    ------
    ValueError
        If the cell holds text that cannot be read as a number.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float, np.number)):
        return -abs(float(x))
    # Strip whitespace only AFTER dropping the currency symbols, so that a value
    # like "$ (5.00)" is still recognized as parenthesized.
    s = str(x).replace("$", "").replace(",", "").strip()
    if not s:
        # A blank cell carries no amount; treat it like a missing value.
        return np.nan
    if s.startswith("(") and s.endswith(")"):
        inner = s[1:-1].strip()
        return abs(float(inner))
    return -abs(float(s))

# Convert every raw "Amount" cell to a signed float with parse_amount.
# NOTE: this overwrites the column on `df` itself.
df["Amount"] = df["Amount"].apply(parse_amount)

# Report missing values per column. Printed explicitly because a bare expression
# that is not the last statement of a cell produces no output.
print(df.isna().sum())

# Continue on an independent copy of the parsed frame.
df_cleaned = df.copy()
df_cleaned.head(10)


# Normalize the "Description" column to create a "Merchant name" column by lowercasing, removing dates and numbers, and collapsing whitespace:

In [43]:
def normalize_text(s: str) -> str:  # for visa card normalization
    """Reduce a raw statement description to a lowercase merchant name.

    Steps, in order:
      1. Missing or blank input yields "".
      2. Lowercase the text.
      3. Blank out non-merchant phrases (payment thank-you lines, cash advance
         and interest-charge entries).
      4. Drop one leading vendor token: "sq" / "pay" followed by at least one
         separator character, "tst*", "www.", or a token starting with
         "http" / "web".
      5. Keep only the letters a-z and "&"; every other run of characters
         (digits, dates, punctuation, whitespace) becomes a single space.

    Parameters
    ----------
    s : str
        Raw description. Missing values (NaN / None) are accepted as well.

    Returns
    -------
    str
        The normalized name, or "" when nothing is left.
    """
    if pd.isna(s) or not str(s).strip():
        return ""
    s = str(s).lower()
    # remove common non-merchant phrases
    s = re.sub(r'(payments?\s*[-:]?\s*thank you|cash advance fee|cash advance|interest charge(?:-purchase|-cash)?|payment\s+thank you)', ' ', s)
    # Trim first: the prefix pattern below is anchored with ^ and would otherwise
    # be skipped whenever the text starts with whitespace.
    s = s.strip()
    # remove leading vendor tokens like `sq*`, `pay*`, `tst*`, `www.`
    # `sq` / `pay` must be followed by at least one separator (\W+); allowing zero
    # separators would cut the first letters off names such as "square" or "payless".
    # NOTE(review): `web\S+` removes any first token that starts with "web"
    # (e.g. "webster") - confirm that this is intended.
    s = re.sub(r'^(?:sq\W+|pay\W+|tst\*|www\.|http\S+|web\S+)\s*', '', s)
    # keep only letters and ampersand; this also removes dates and digit runs and
    # leaves exactly one space between words, so no separate passes are needed
    s = re.sub(r'[^a-z&]+', ' ', s)
    return s.strip()

# Store the normalized merchant names on the cleaned frame so that the CSV
# written below includes the new column.
df_cleaned['Merchant name'] = (
    df['Description']
    .fillna('')
    .apply(normalize_text)
)
df_cleaned.head(10)


df_cleaned.to_csv(path_or_buf="../path/to/credit-account-aggregated-cleaned.csv", index=False)