In [1]:
from pathlib import Path
import yaml

# Resolve the config file relative to the notebook's working directory; the
# project root is taken to be two directory levels above that file.
CONFIG_PATH = Path("../config/config.yaml").resolve()
PROJECT_ROOT = CONFIG_PATH.parents[1]

with CONFIG_PATH.open() as f:
    cfg = yaml.safe_load(f)

# Anchor every entry of the config's "paths" mapping at the project root.
PATHS = {name: PROJECT_ROOT / location for name, location in cfg["paths"].items()}

RAW_DIR = PATHS["raw_data"]
INTERMEDIATE_DIR = PATHS["intermediate_data"]

# Fail fast when the raw-data directory has not been created yet.
assert RAW_DIR.exists(), "Run step-0-init-project.ipynb first"

# Script to remove internal transfers between two account exports:

In [2]:
import re
import pandas as pd
from datetime import timedelta
import os

# Upper-case phrases that mark a description as a possible transfer.
# They are matched as plain substrings, so order does not affect the result.
TRANSFER_KEYWORDS = [
    "MOBILE TO",
    "MOBILE FROM",
    "ONLINE TO",
    "ONLINE FROM",
    "TRANSFER TO",
    "TRANSFER FROM",
    "IN-BRANCH TRANSFER",
    "PREAUTHORIZED TRANSFER",
    "TRANSFER",
    "WIRE REF",
    "WIRE TRANSFER",
    "TRANSFER TO CHECKING",
    "TRANSFER TO SAVINGS",
]

# Masked account reference: two or more asterisks followed by two or more digits.
MASK_RE = re.compile(r'(\*{2,}\d{2,})')

def normalize_desc(s):
    """Return ``s`` as upper-case text with whitespace runs collapsed to one space.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string; any other value is converted with ``str()`` first. Leading and
    trailing whitespace is removed.
    """
    if pd.isna(s):
        return ""
    upper_text = str(s).upper()
    single_spaced = re.sub(r'\s+', ' ', upper_text)
    return single_spaced.strip()

def extract_mask(s):
    """Return the first ``MASK_RE`` match found in ``s``, or ``None``.

    A falsy ``s`` (``None`` or an empty string) is searched as the empty
    string, so it yields ``None``.
    """
    text = s if s else ""
    found = MASK_RE.search(text)
    if found is None:
        return None
    return found.group(1)

def pair_internal_between_files(path_a: str, path_b: str, window_days: int = 3, amount_tolerance: float = 0.01):
    """Pair internal transfers between two account CSVs and save copies without them.

    A row is a *candidate* when its normalized description contains one of
    ``TRANSFER_KEYWORDS``. Candidates are visited in date order and greedily
    matched with a not-yet-used row from the *other* file whose amount has the
    opposite sign, whose absolute amount is within ``amount_tolerance`` and
    whose date is within ``window_days``. A partner carrying the same masked
    account reference is preferred; otherwise the first eligible row is taken.

    Each file must have ``Amount`` and ``Transaction Date`` columns (``Amount``
    is assumed to be numeric). Descriptions are read from ``Full description``
    and fall back, row by row, to ``Description``; a file with neither column
    is treated as having blank descriptions.

    Side effects: writes ``<base>-internals-removed<ext>`` next to each input
    and prints a short summary. The saved files are filtered copies of the
    frames exactly as read from disk, so date text and column dtypes are not
    altered by the matching step.

    Args:
        path_a: CSV path of the first account (tagged ``'A'``).
        path_b: CSV path of the second account (tagged ``'B'``).
        window_days: Maximum distance in days between the two legs of a pair.
        amount_tolerance: Maximum absolute difference between the two amounts.

    Returns:
        The merged working frame (both files, ``Transaction Date`` parsed to
        datetimes) with the helper columns ``source``, ``desc_norm``, ``mask``,
        ``abs_amount``, ``Candidate``, ``is_internal`` and ``internal_id``.

    Raises:
        FileNotFoundError: If either CSV path does not exist.
        KeyError: If either file lacks ``Amount`` or ``Transaction Date``.
    """
    raw_a = pd.read_csv(path_a)
    raw_b = pd.read_csv(path_b)

    # Work on tagged copies so the frames read from disk stay pristine for saving.
    df = pd.concat(
        [_tag_source(raw_a, 'A', path_a), _tag_source(raw_b, 'B', path_b)],
        ignore_index=True,
    )

    # normalize helper columns
    df['desc_norm'] = _normalized_descriptions(df)
    df['mask'] = df['desc_norm'].apply(extract_mask)
    df['abs_amount'] = df['Amount'].abs().round(2)
    df['Candidate'] = (
        df['desc_norm']
        .apply(lambda d: any(k in d for k in TRANSFER_KEYWORDS))
        .astype(bool)
    )
    df['is_internal'] = False
    df['internal_id'] = pd.NA

    pair_count = _mark_internal_pairs(df, window_days, amount_tolerance)

    # ignore_index=True keeps file A's rows first, so positions in the merged
    # frame map straight back onto each original frame.
    internal = df['is_internal'].to_numpy(dtype=bool)
    split = len(raw_a)
    path_out_a = _save_without_internals(raw_a, ~internal[:split], path_a)
    path_out_b = _save_without_internals(raw_b, ~internal[split:], path_b)

    print(f"Removed {pair_count} internal transfer pairs.")
    print(f"Saved: {path_out_a}\nSaved: {path_out_b}")

    return df


def _tag_source(raw, label, path):
    """Check required columns and return a copy of ``raw`` with parsed dates and a ``source`` tag."""
    missing = [col for col in ('Amount', 'Transaction Date') if col not in raw.columns]
    if missing:
        raise KeyError(f"{path} is missing required column(s): {missing}")
    tagged = raw.copy()
    # Parse per file so two exports may use different date formats.
    tagged['Transaction Date'] = pd.to_datetime(tagged['Transaction Date'])
    tagged['source'] = label
    return tagged


def _normalized_descriptions(df):
    """Return one normalized description per row, preferring 'Full description' over 'Description'."""
    result = pd.Series("", index=df.index, dtype=object)
    for col in ('Full description', 'Description'):
        if col in df.columns:
            fallback = df[col].apply(normalize_desc)
            # Keep what is already filled in; use this column only where still blank.
            result = result.where(result != "", fallback)
    return result


def _mark_internal_pairs(df, window_days, amount_tolerance):
    """Flag matched transfer pairs in ``df`` in place and return how many pairs were found."""
    window = timedelta(days=window_days)
    non_negative = df['Amount'] >= 0  # loop-invariant sign of every row
    candidates = df[df['Candidate']].sort_values('Transaction Date').index.tolist()
    used = set()
    pair_count = 0

    for idx in candidates:
        if idx in used:
            continue
        when = df.at[idx, 'Transaction Date']
        # Requiring the other source also rules out the candidate row itself.
        eligible = (
            (df['source'] != df.at[idx, 'source'])
            & ~df.index.isin(used)
            & (non_negative != non_negative.at[idx])
            & (df['abs_amount'].sub(df.at[idx, 'abs_amount']).abs() <= amount_tolerance)
            & df['Transaction Date'].between(when - window, when + window)
        )
        pool = df.loc[eligible]
        if pool.empty:
            continue

        # Prefer a partner with the same masked account reference.
        partner_idx = pool.index[0]
        own_mask = df.at[idx, 'mask']
        if pd.notna(own_mask):
            same_mask = pool[pool['mask'] == own_mask]
            if not same_mask.empty:
                partner_idx = same_mask.index[0]

        pair_count += 1
        for member in (idx, partner_idx):
            df.at[member, 'is_internal'] = True
            df.at[member, 'internal_id'] = pair_count
            used.add(member)

    return pair_count


def _save_without_internals(raw, keep, original_path):
    """Write the rows of ``raw`` selected by the boolean array ``keep`` next to ``original_path``; return the new path."""
    base, ext = os.path.splitext(original_path)
    new_path = f"{base}-internals-removed{ext}"
    raw.loc[keep].to_csv(new_path, index=False)
    return new_path

In [3]:
# NOTE(review): both paths look like template placeholders; point them at the
# real account CSV exports before running this cell.
pair_internal_between_files(
    path_a='../path/to/primary/checking/with/internal/transfers.csv',
    path_b='../path/to/secondary/checking/with/internal/transfers.csv',
    window_days=5,
)


FileNotFoundError: [Errno 2] No such file or directory: '../path/to/primary/checking/with/internal/transfers.csv'