# ECG Folder Processing — v2 (RR-interval anomaly layer)

Reads v1 output (`*_rich_processed_beats.csv`) and adds RR-interval timing analysis.
Saves results to `*_v2_processed_beats.csv` so v1 is never modified.

**New columns added:**
- `rr_baseline` — rolling median RR of the preceding 7 beats (seconds)
- `rr_flag` — `premature` | `compensatory` | `missed_beat` | *empty* (empty = normal timing)
- `rr_burden_pct` — % of premature beats in this 20-s segment

**Comparison colour coding in the viewer:**
- 🟢 Green dot — v1=normal, v2 timing normal
- 🟡 Yellow diamond — v1=normal **but** v2 flagged as `premature` (v1 missed it)
- 🔴 Red dot — v1=PVC or VT
- 🔵 Blue shading — `compensatory` pause zone
- 🟠 Orange shading — `missed_beat` zone

In [None]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from ipywidgets import Button, HBox, Output, Dropdown, HTML
from IPython.display import display, clear_output

sys.path.insert(0, '.')
import ecg_library
from ecg_library.rr_analysis import analyze_rr_anomalies
from ecg_library.filters import filter_signal

# Root folder that is searched recursively for session sub-folders and their CSV files.
DATA_DIR = 'Data'
# ECG sampling rate in samples per second; used to convert sample indices to seconds.
FS = 125
# Filename suffix of the v1 per-beat output that this notebook reads.
V1_SUFFIX = '_rich_processed_beats.csv'
# Filename suffix of the v2 output written by this notebook (v1 files are never overwritten).
V2_SUFFIX = '_v2_processed_beats.csv'

print('ecg_library loaded.')

## 1. Process sessions — add RR analysis on top of v1 output

In [None]:
# Discover every v1 per-beat CSV below DATA_DIR, at any nesting depth.
v1_files = glob.glob(os.path.join(DATA_DIR, '**', f'*{V1_SUFFIX}'), recursive=True)
print(f'Found {len(v1_files)} v1 session(s).')

for v1_path in sorted(v1_files):
    session_dir = os.path.dirname(v1_path)
    subfolder   = os.path.basename(session_dir)
    # The v2 file is named after the session folder and written next to the v1 file.
    v2_path     = os.path.join(session_dir, f'{subfolder}{V2_SUFFIX}')

    # An existing v2 file is treated as final; delete it to force reprocessing.
    if os.path.exists(v2_path):
        print(f'  [skip] {subfolder}')
        continue

    print(f'  [processing] {subfolder} ...', end=' ', flush=True)
    # Write to a temporary name first: because of the skip-if-exists check above,
    # a half-written v2 file would otherwise be silently accepted on every later
    # run.  The '.tmp' ending does not match the V2_SUFFIX glob used elsewhere.
    tmp_path = f'{v2_path}.tmp'
    try:
        df = pd.read_csv(v1_path)
        df_v2 = analyze_rr_anomalies(df)
        # Count flags BEFORE writing so a result without an 'rr_flag' column is
        # reported as an error instead of being persisted and then skipped forever.
        n_premature = (df_v2['rr_flag'] == 'premature').sum()
        n_missed    = (df_v2['rr_flag'] == 'missed_beat').sum()
        df_v2.to_csv(tmp_path, index=False)
        os.replace(tmp_path, v2_path)
        print(f'done  ({n_premature} premature, {n_missed} missed_beat)')
    except Exception as e:
        # Deliberately broad: one bad session must not stop the whole batch.
        print(f'ERROR: {e}')
    finally:
        # Remove a leftover partial file (also runs on KeyboardInterrupt).
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

print('\nAll sessions processed.')

## 2. Compare v1 vs v2 — interactive segment viewer

In [None]:
# ── session selector ──────────────────────────────────────────────────────────
# Every v2 file found below DATA_DIR contributes one session, identified by the
# name of the folder that contains it.
v2_files   = sorted(glob.glob(os.path.join(DATA_DIR, '**', f'*{V2_SUFFIX}'), recursive=True))
subfolders = [os.path.basename(os.path.dirname(p)) for p in v2_files]

if not subfolders:
    # Message text repaired: the em dash had been stored as mojibake.
    print('No v2 files found — run the processing cell above first.')
else:
    # Default to IndoorCycling if present, otherwise first session
    default = next((s for s in subfolders if 'IndoorCycling' in s), subfolders[0])

    session_dd = Dropdown(options=subfolders, value=default, description='Session:')
    out        = Output()

    # Shared viewer state: the loaded beat table, the ordered list of segment
    # filenames, and the index of the segment currently shown.
    state = {'df': None, 'files': [], 'idx': 0}

    def _numeric_key(fname):
        return int(''.join(filter(str.isdigit, fname.split('.')[0])))

    def load_session(subfolder):
        """Read *subfolder*'s v2 beat table and point the viewer at its first segment.

        The table is loaded from ``DATA_DIR/<subfolder>/<subfolder><V2_SUFFIX>``.
        ``state['df']`` receives the table, ``state['files']`` the numerically
        ordered segment filenames, and ``state['idx']`` is reset to 0.
        """
        csv_name = f'{subfolder}{V2_SUFFIX}'
        beats    = pd.read_csv(os.path.join(DATA_DIR, subfolder, csv_name))
        ordered  = sorted(beats['filename'].unique(), key=_numeric_key)
        state.update(
            df=beats,
            # ecg_0.csv is excluded (noted as a metadata file, not a segment).
            files=[name for name in ordered if name != 'ecg_0.csv'],
            idx=0,
        )

    def load_ecg(subfolder, fname):
        """Load one segment file and return its first column after filtering.

        The CSV at ``DATA_DIR/<subfolder>/<fname>`` is read without a header
        row, text following '#' is treated as a comment, and column 0 is
        converted to float before being handed to ``filter_signal`` together
        with the sampling rate ``FS``.
        """
        segment_path = os.path.join(DATA_DIR, subfolder, fname)
        table        = pd.read_csv(segment_path, header=None, comment='#')
        samples      = table[0].values.astype(float)
        return filter_signal(samples, FS)

    def plot_segment():
        """Redraw the currently selected segment inside the ``out`` widget.

        The session name comes from ``session_dd`` and the beat table, segment
        list and current index from ``state``.  Two stacked axes sharing the
        time axis are drawn:

        * top: the filtered ECG trace with one marker per beat (red for v1
          PVC/VT, gold diamond for beats only v2 flags as premature, green
          otherwise) plus shaded ``compensatory`` / ``missed_beat`` zones
          covering the RR interval that ends at the flagged beat;
        * bottom: the RR-interval strip with its baseline and colour-coded
          flagged points.

        Prints 'No segments found.' and returns when the session has no
        segment files.
        """
        # r_index is reduced modulo this many samples to obtain the beat's
        # position inside the segment.
        # NOTE(review): 2500 equals 20 s at FS=125; confirm every segment file
        # has that length before deriving this from FS.
        samples_per_segment = 2500
        # Beat markers are never drawn above this amplitude.
        max_marker_amp = 950
        v1_anomaly_types = ['PVC', 'VT']
        # Colours for flagged points on the RR strip (built once, not per beat).
        rr_flag_colors = {
            'premature':    'red',
            'compensatory': 'steelblue',
            'missed_beat':  'orange',
        }

        with out:
            clear_output(wait=True)
            subfolder = session_dd.value
            df        = state['df']
            files     = state['files']
            idx       = state['idx']

            if not files:
                print('No segments found.')
                return

            fname  = files[idx]
            seg_df = df[df['filename'] == fname]
            sig    = load_ecg(subfolder, fname)
            t      = np.arange(len(sig)) / FS

            # ── stats for title ──
            is_v1_anom   = seg_df['beat_type'].isin(v1_anomaly_types)
            is_premature = seg_df['rr_flag'] == 'premature'
            n_v1_anom    = is_v1_anom.sum()
            n_v2_new     = (is_premature & ~is_v1_anom).sum()
            burden       = seg_df['rr_burden_pct'].iloc[0] if len(seg_df) > 0 else 0.0

            fig, (ax_ecg, ax_rr) = plt.subplots(
                2, 1, figsize=(16, 7),
                gridspec_kw={'height_ratios': [3, 1]},
                sharex=True,
            )

            # ── ECG trace ──
            ax_ecg.plot(t, sig, 'k-', lw=0.7, zorder=1)

            for _, beat in seg_df.iterrows():
                r_local = int(beat['r_index']) % samples_per_segment
                r_t     = r_local / FS
                r_amp   = beat['r_amplitude']
                flag    = beat['rr_flag']
                btype   = beat['beat_type']

                # shaded zones: pick the style first, then draw only when the
                # RR interval that defines the zone width is a finite number
                # (a NaN interval would otherwise yield a NaN-bounded span).
                zone_style = None
                if flag == 'compensatory' and np.isfinite(beat['rr_baseline']):
                    zone_style = ('steelblue', 0.12)
                elif flag == 'missed_beat':
                    zone_style = ('orange', 0.18)

                if zone_style is not None:
                    rr_len = float(beat['rr_interval'])
                    if np.isfinite(rr_len):
                        zone_color, zone_alpha = zone_style
                        ax_ecg.axvspan(max(r_t - rr_len, 0), r_t,
                                       color=zone_color, alpha=zone_alpha, zorder=0)

                # beat dots
                plot_amp = min(float(r_amp), max_marker_amp)
                if btype in v1_anomaly_types:
                    color, marker, zorder = 'red', 'o', 4
                    ax_ecg.annotate(btype, (r_t, plot_amp + 30),
                                    color='red', fontsize=9, fontweight='bold', zorder=5)
                elif flag == 'premature':
                    # v1 missed it — v2 caught it by timing alone
                    color, marker, zorder = 'gold', 'D', 4
                    ax_ecg.annotate('?premature', (r_t, plot_amp + 30),
                                    color='goldenrod', fontsize=8, zorder=5)
                else:
                    color, marker, zorder = 'limegreen', 'o', 3

                ax_ecg.plot(r_t, plot_amp, marker=marker, color=color,
                            ms=6, zorder=zorder)

            ax_ecg.set_ylabel('Amplitude')
            ax_ecg.set_title(
                f'{subfolder}  |  {fname}  ({idx+1}/{len(files)})\n'
                f'v1: {n_v1_anom} PVC/VT    '
                f'v2 new (timing only): {n_v2_new}    '
                f'Burden: {burden:.1f}%',
                fontsize=10,
            )
            ax_ecg.grid(True, alpha=0.25)

            # legend
            ax_ecg.legend(handles=[
                mpatches.Patch(color='limegreen', label='Normal (v1+v2 agree)'),
                mpatches.Patch(color='red',       label='PVC / VT (v1 morphology)'),
                mpatches.Patch(color='gold',      label='? premature (v2 RR only)'),
                mpatches.Patch(color='steelblue', alpha=0.4, label='Compensatory pause zone'),
                mpatches.Patch(color='orange',    alpha=0.4, label='Missed beat zone'),
            ], loc='upper right', fontsize=8)

            # ── RR interval strip ──
            # Non-numeric cells become NaN so they leave gaps instead of raising.
            rr_vals      = pd.to_numeric(seg_df['rr_interval'], errors='coerce').values
            rr_base_vals = pd.to_numeric(seg_df['rr_baseline'], errors='coerce').values
            r_times      = [(int(r) % samples_per_segment) / FS for r in seg_df['r_index']]
            flags_arr    = seg_df['rr_flag'].values

            ax_rr.plot(r_times, rr_vals, 'k.-', lw=0.8, ms=4, label='RR interval')
            ax_rr.plot(r_times, rr_base_vals, 'b--', lw=1, alpha=0.6, label='Baseline (rolling median)')

            # colour-code individual RR points
            for rt, rr, flag in zip(r_times, rr_vals, flags_arr):
                if not np.isfinite(rr):
                    continue
                dot_color = rr_flag_colors.get(flag)
                if dot_color:
                    ax_rr.plot(rt, rr, 'o', color=dot_color, ms=7, zorder=4)

            ax_rr.set_ylabel('RR (s)')
            ax_rr.set_xlabel('Time (s)')
            ax_rr.legend(fontsize=8, loc='upper right')
            ax_rr.grid(True, alpha=0.25)

            plt.tight_layout()
            plt.show()

    # ── navigation ────────────────────────────────────────────────────────────
    # Captions use the real ◀ / ▶ arrow glyphs (the stored text was mojibake).
    prev_btn = Button(description='◀ Prev')
    next_btn = Button(description='Next ▶')

    def on_prev(_):
        """Button callback: step back one segment (never below index 0) and redraw."""
        previous = state['idx'] - 1
        state['idx'] = previous if previous > 0 else 0
        plot_segment()

    def on_next(_):
        """Button callback: advance one segment (stopping at the last) and redraw.

        The index is also clamped at 0 so that an empty segment list leaves it
        at 0 rather than driving it to -1.
        """
        last = len(state['files']) - 1
        state['idx'] = max(0, min(last, state['idx'] + 1))
        plot_segment()

    def on_session_change(change):
        """Dropdown observer: load the newly selected session and redraw the viewer."""
        selected = change['new']
        load_session(selected)
        plot_segment()

    # Attach each callback to its widget.
    for button, handler in ((prev_btn, on_prev), (next_btn, on_next)):
        button.on_click(handler)
    session_dd.observe(on_session_change, names='value')

    # Load and draw the default session, then show the controls with the plot output.
    load_session(default)
    plot_segment()
    controls = HBox([prev_btn, next_btn, session_dd])
    display(controls, out)

## 3. Session-level burden summary

In [None]:
# Summarise v1 vs v2 anomaly counts across all processed sessions.
# One row per v2 file; burden columns are percentages of all beats in the session.

# Declared up front so the table keeps its columns (and can still be sorted)
# when no v2 file exists yet.
summary_columns = [
    'session',
    'total_beats',
    'v1_PVC_VT',
    'v1_burden%',
    'v2_premature',
    'v2_new_only',
    'v2_missed_beat',
    'v2_burden%',
]

rows = []
for v2_path in sorted(glob.glob(os.path.join(DATA_DIR, '**', f'*{V2_SUFFIX}'), recursive=True)):
    subfolder = os.path.basename(os.path.dirname(v2_path))
    df = pd.read_csv(v2_path)

    # Boolean masks are computed once and reused for every count below.
    is_v1_anom   = df['beat_type'].isin(['PVC', 'VT'])
    is_premature = df['rr_flag'] == 'premature'

    total        = len(df)
    v1_pvc_vt    = is_v1_anom.sum()
    v2_premature = is_premature.sum()
    # Premature by timing but not labelled PVC/VT by v1.
    v2_new       = (is_premature & ~is_v1_anom).sum()
    v2_missed    = (df['rr_flag'] == 'missed_beat').sum()
    # Guard against empty sessions to avoid dividing by zero.
    v1_burden    = round(v1_pvc_vt / total * 100, 2) if total > 0 else 0
    v2_burden    = round(v2_premature / total * 100, 2) if total > 0 else 0

    rows.append({
        'session':        subfolder,
        'total_beats':    total,
        'v1_PVC_VT':      v1_pvc_vt,
        'v1_burden%':     v1_burden,
        'v2_premature':   v2_premature,
        'v2_new_only':    v2_new,
        'v2_missed_beat': v2_missed,
        'v2_burden%':     v2_burden,
    })

summary = pd.DataFrame(rows, columns=summary_columns)
summary = summary.sort_values('v2_burden%', ascending=False).reset_index(drop=True)
pd.set_option('display.max_colwidth', 60)
pd.set_option('display.width', 200)
display(summary)