# In [None]:
from pathlib import Path
from datetime import timedelta
from zoneinfo import ZoneInfo
import pandas as pd
import numpy as np

# Time zone whose wall-clock time every HAR and EEG timestamp is expressed in
# before the two are compared.
TARGET_TZ = ZoneInfo("America/Indiana/Indianapolis")
# True: EEG text timestamps are read as UTC; False: as TARGET_TZ local time.
EEG_TEXT_IS_UTC = True           
# True: keep only the most frequent calendar date found in each HAR file.
USE_DOMINANT_DAY = False        
# Inclusive range of years a HAR timestamp must fall in to be used.
YEAR_MIN, YEAR_MAX = 2025, 2025

# Root directories; the driver loop reads/writes one sub-directory per subject.
HAR_BASE = Path("/home/jupyter-yin10/EEG_HAR/data/har_convert")
EEG_BASE = Path("/home/jupyter-yin10/EEG_HAR/NEW/data/EEG_25hz_clean")
OUT_BASE = Path("/home/jupyter-yin10/EEG_HAR/NEW/data/EEG_Clipped")
# Subject folder names: "s1" through "s6".
SUBJECTS = [f"s{i}" for i in range(1, 7)]
# True: an EEG file with no rows inside any HAR window is copied in full
# instead of being written out with zero rows.
SAVE_FULL_IF_NO_OVERLAP = True

def safe_read_csv(path):
    """Read a CSV with every column as text, tolerating BOMs, odd encodings and empty files.

    Args:
        path: Location of the CSV file (string or path-like).

    Returns:
        A DataFrame whose columns are all strings; blank cells stay as ""
        because NA detection is disabled. A file with no content at all
        yields an empty DataFrame with no columns.
    """
    read_opts = {"dtype": str, "na_filter": False}
    try:
        # utf-8-sig also decodes plain UTF-8 and strips a leading BOM if present.
        return pd.read_csv(path, encoding="utf-8-sig", **read_opts)
    except UnicodeDecodeError:
        # Plain "utf-8" would reject exactly the same bytes, so retrying with it
        # cannot help. latin-1 maps every byte to a character and never fails.
        return pd.read_csv(path, encoding="latin-1", **read_opts)
    except pd.errors.EmptyDataError:
        # Zero-byte file: return an empty frame so callers' emptiness checks
        # apply instead of an exception escaping.
        return pd.DataFrame()

def normalize_text_ts(s):
    """Return a cleaned string copy of a Series of timestamp text.

    Non-breaking spaces become ordinary spaces, runs of whitespace collapse
    to a single space, surrounding whitespace is stripped, and values shaped
    like ``YYYY-M-D H:MM`` (``-`` or ``/`` as date separator) get ``:00``
    appended so they carry a seconds field.
    """
    text = s.astype(str).str.replace("\u00A0", " ", regex=False)
    text = text.str.replace(r"\s+", " ", regex=True).str.strip()
    lacks_seconds = text.str.match(r"^\d{4}[-/]\d{1,2}[-/]\d{1,2}\s+\d{1,2}:\d{2}$")
    # Only the minute-resolution entries are extended; everything else passes through.
    return text.mask(lacks_seconds, text + ":00")

def parse_har_local(series):
    """Parse HAR timestamp text, read as TARGET_TZ wall-clock time, into naive datetimes.

    The text is tried as ``%Y/%m/%d %H:%M:%S`` first, then as
    ``%Y-%m-%d %H:%M:%S``, then with pandas' generic parser. Values that
    never parse become NaT, as do wall-clock times that do not exist or are
    ambiguous in TARGET_TZ.
    """
    text = normalize_text_ts(series)
    parsed = pd.to_datetime(text, format="%Y/%m/%d %H:%M:%S", errors="coerce")

    # Second attempt: dash-separated dates, only for what is still missing.
    missing = parsed.isna()
    if missing.any():
        dashed = pd.to_datetime(text[missing], format="%Y-%m-%d %H:%M:%S", errors="coerce")
        parsed.loc[missing & dashed.notna()] = dashed
        missing = parsed.isna()

    # Last resort: let pandas guess the layout of the leftovers.
    if missing.any():
        guessed = pd.to_datetime(text[missing], errors="coerce")
        parsed.loc[missing] = guessed

    # Round-trip through the target zone so impossible/ambiguous local times
    # drop out as NaT, then return plain naive timestamps.
    aware = parsed.dt.tz_localize(TARGET_TZ, nonexistent="NaT", ambiguous="NaT")
    aware = aware.dt.tz_convert(TARGET_TZ)
    return aware.dt.tz_localize(None)

def _eeg_text_to_local(parsed):
    """Turn parsed EEG text timestamps into naive TARGET_TZ wall-clock time."""
    if parsed.dt.tz is not None:
        # The text carried its own UTC offset (e.g. a trailing "Z"); localizing
        # an already-aware series would raise, so trust the embedded offset.
        aware = parsed
    elif EEG_TEXT_IS_UTC:
        aware = parsed.dt.tz_localize("UTC")
    else:
        # Local wall-clock text: times inside a DST gap/overlap become NaT
        # instead of raising, matching how HAR timestamps are handled.
        aware = parsed.dt.tz_localize(TARGET_TZ, nonexistent="NaT", ambiguous="NaT")
    return aware.dt.tz_convert(TARGET_TZ).dt.tz_localize(None)


def parse_eeg_to_local(series):
    """Parse an EEG timestamp column into naive datetimes in TARGET_TZ wall-clock time.

    Numeric values are treated as UTC epoch timestamps: milliseconds when the
    median is at least 1e12, seconds otherwise. While fewer than 60 percent
    of the rows have a value, text parsing is attempted in turn with
    ``%Y/%m/%d %H:%M:%S``, ``%Y-%m-%d %H:%M:%S`` and pandas' generic parser.
    Text without an offset is read as UTC or as TARGET_TZ local time depending
    on EEG_TEXT_IS_UTC; text with an explicit offset keeps that offset.

    Args:
        series: Series of raw timestamp values (epoch numbers or text).

    Returns:
        A datetime Series on the same index as ``series``; entries that could
        not be interpreted are NaT.
    """
    s = normalize_text_ts(series)
    dt = pd.Series(pd.NaT, index=series.index, dtype="datetime64[ns]")

    # Epoch first: anything numeric is assumed to be seconds or milliseconds since 1970 UTC.
    s_num = pd.to_numeric(series, errors="coerce")
    if s_num.notna().any():
        unit = "ms" if s_num.dropna().median() >= 1e12 else "s"
        dt_epoch = pd.to_datetime(s_num, unit=unit, utc=True, errors="coerce")
        dt.loc[dt.isna()] = dt_epoch.dt.tz_convert(TARGET_TZ).dt.tz_localize(None)

    # Text stages, from most specific layout to the generic parser (format=None).
    text_formats = ("%Y/%m/%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", None)
    for fmt in text_formats:
        # Coverage only grows, so once 60 percent is reached no later stage would run.
        if not (dt.notna().mean() < 0.60):
            break
        parsed = pd.to_datetime(s, format=fmt, errors="coerce")
        fill = dt.isna() & parsed.notna()
        dt.loc[fill] = _eeg_text_to_local(parsed.loc[fill])

    return dt

def find_timestamp_column(df):
    """Pick the column of ``df`` that most plausibly holds timestamps.

    A column literally named ``timestamp`` wins outright. Otherwise each
    column is run through ``pd.to_datetime`` and the one with the highest
    share of parseable values is returned; on ties the earliest column is
    kept. ``None`` comes back only when no column scored at all (for
    example, a frame without columns).
    """
    if "timestamp" in df.columns:
        return "timestamp"

    winner = None
    top_score = -1.0
    for name in df.columns:
        score = pd.to_datetime(df[name], errors="coerce").notna().mean()
        # Strict comparison keeps the first column among equals.
        if score > top_score:
            winner, top_score = name, score
    return winner

# ---------------------------
# Build windows from HAR
# ---------------------------
def build_windows_from_har(har_dir: Path):
    """Derive one padded time window per HAR CSV found in ``har_dir``.

    Files whose name starts with ``har_time_windows_`` are skipped. For every
    other ``*.csv`` the first column is parsed as local timestamps, limited to
    years YEAR_MIN..YEAR_MAX (inclusive) and, when USE_DOMINANT_DAY is set, to
    the most frequent calendar date. The window runs from 10 seconds before
    the earliest remaining timestamp to 10 seconds after the latest.

    Args:
        har_dir: Directory containing the HAR CSV files.

    Returns:
        DataFrame with columns ``window_id``, ``har_file``, ``start`` and
        ``end``, sorted by ``start`` then ``end``. ``window_id`` reflects the
        order in which files were accepted, not the sorted order.

    Raises:
        RuntimeError: If no file produced a window.
    """
    padding = timedelta(seconds=10)
    windows = []
    for csv_path in sorted(har_dir.glob("*.csv")):
        if csv_path.name.startswith("har_time_windows_"):
            continue

        frame = safe_read_csv(csv_path)
        if frame.shape[1] == 0:
            continue

        stamps = parse_har_local(frame.iloc[:, 0])
        years = stamps.dt.year
        stamps = stamps[(years >= YEAR_MIN) & (years <= YEAR_MAX)]
        if stamps.empty:
            continue

        if USE_DOMINANT_DAY:
            days = stamps.dt.date
            busiest_day = days.value_counts().idxmax()
            stamps = stamps[days == busiest_day]
            if stamps.empty:
                continue

        windows.append({
            "window_id": len(windows) + 1,
            "har_file": csv_path.name,
            "start": stamps.min() - padding,
            "end": stamps.max() + padding,
        })

    table = pd.DataFrame(windows, columns=["window_id", "har_file", "start", "end"])
    if table.empty:
        raise RuntimeError(f"No HAR windows built in {har_dir}")
    ordered = table.sort_values(["start", "end"])
    return ordered.reset_index(drop=True)

# ---------------------------
# Diagnose and clip
# ---------------------------
def diagnose_and_clip(eeg_dir: Path, out_dir: Path, windows_df: pd.DataFrame, save_full_if_no_overlap: bool = True):
    """Clip every EEG CSV in ``eeg_dir`` to the HAR windows and report what was kept.

    For each ``*.csv`` file (in sorted order) the rows whose timestamp lies
    inside at least one ``[start, end]`` window (bounds inclusive) are written
    to ``<stem>_filtered.csv`` in ``out_dir``. Diagnostics are printed along
    the way and a per-file summary is saved as ``clip_summary.csv``.

    Args:
        eeg_dir: Directory holding the EEG CSV files.
        out_dir: Destination directory; created if missing.
        windows_df: Table with ``window_id``, ``start`` and ``end`` columns.
        save_full_if_no_overlap: When true, a file with no timestamp column or
            with no row inside any window is copied unchanged to
            ``<stem>_no_overlap_full.csv`` and recorded as 100.0 percent;
            otherwise a zero-row ``<stem>_filtered.csv`` is written.

    Returns:
        DataFrame with one row per EEG file and the columns ``eeg_file``,
        ``status``, ``rows_before``, ``rows_after`` and ``pct_in_window``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    win = windows_df.copy()
    for bound in ("start", "end"):
        win[bound] = pd.to_datetime(win[bound])

    print("\nHAR windows, local time")
    for _, row in win.iterrows():
        print(f"  id {int(row.window_id)}, {row.start} to {row.end}")

    records = []
    for eeg_path in sorted(eeg_dir.glob("*.csv")):
        df = safe_read_csv(eeg_path)
        before = len(df)
        filtered_path = out_dir / f"{eeg_path.stem}_filtered.csv"
        full_copy_path = out_dir / f"{eeg_path.stem}_no_overlap_full.csv"

        # Nothing to clip: still emit an (empty) output so every input has a counterpart.
        if df.empty:
            df.iloc[0:0].to_csv(filtered_path, index=False, encoding="utf-8")
            records.append([eeg_path.name, "EMPTY INPUT", before, 0, 0.0])
            print(f"{eeg_path.name}: 0.0 percent in window, empty input")
            continue

        if "timestamp" in df.columns:
            ts_col = "timestamp"
        else:
            ts_col = find_timestamp_column(df)

        if ts_col is None:
            if save_full_if_no_overlap:
                df.to_csv(full_copy_path, index=False, encoding="utf-8")
                records.append([eeg_path.name, "NO TIMESTAMP, WROTE FULL FILE", before, before, 100.0])
                print(f"{eeg_path.name}: 100.0 percent in window, no timestamp column, wrote full file")
            else:
                df.iloc[0:0].to_csv(filtered_path, index=False, encoding="utf-8")
                records.append([eeg_path.name, "NO TIMESTAMP, WROTE EMPTY", before, 0, 0.0])
                print(f"{eeg_path.name}: 0.0 percent in window, no timestamp column, wrote empty")
            continue

        times_local = parse_eeg_to_local(df[ts_col])
        if times_local.notna().any():
            print(f"\nEEG file, {eeg_path.name}")
            print(f"  EEG local min, {times_local.min()}, EEG local max, {times_local.max()}")

        # Union of the per-window membership masks.
        keep = pd.Series(False, index=df.index)
        for _, window in win.iterrows():
            inside = (times_local >= window["start"]) & (times_local <= window["end"])
            if inside.any():
                hits = times_local[inside]
                print(f"  Overlap with window {int(window.window_id)}, rows {int(inside.sum())}, first {hits.min()}, last {hits.max()}")
            keep = keep | inside

        if keep.any():
            filtered = df[keep].copy()
            target = filtered_path
            filtered.to_csv(target, index=False, encoding="utf-8")
            after = len(filtered)
            pct = round(after / max(before, 1) * 100.0, 2)
            status = "OK"
        elif save_full_if_no_overlap:
            target = full_copy_path
            df.to_csv(target, index=False, encoding="utf-8")
            after = before
            pct = 100.0
            status = "NO OVERLAP, WROTE FULL FILE"
        else:
            target = filtered_path
            df.iloc[0:0].to_csv(target, index=False, encoding="utf-8")
            after = 0
            pct = 0.0
            status = "NO OVERLAP, WROTE EMPTY"

        records.append([eeg_path.name, status, before, after, pct])
        print(f"{eeg_path.name}: {pct} percent of EEG rows fall inside any HAR window")
        print(f"  Saved, {target}")

    sum_df = pd.DataFrame(records, columns=["eeg_file", "status", "rows_before", "rows_after", "pct_in_window"])
    sum_csv = out_dir / "clip_summary.csv"
    sum_df.to_csv(sum_csv, index=False, encoding="utf-8")
    print(f"\n[OK] Summary saved, {sum_csv}")
    return sum_df

# Driver: process each subject in turn, then merge the per-subject summaries.
all_summaries = []

for subj in SUBJECTS:
    # Banner so each subject's diagnostics are easy to find in the output.
    print("\n" + "=" * 70)
    print(f"Subject {subj}")
    print("=" * 70)

    # Input and output folders all share the subject name as sub-directory.
    har_dir = HAR_BASE / subj
    eeg_dir = EEG_BASE / subj
    out_dir = OUT_BASE / subj
    out_dir.mkdir(parents=True, exist_ok=True)

    print("Building windows from HAR")
    # Raises RuntimeError if this subject has no usable HAR data, stopping the run.
    windows = build_windows_from_har(har_dir)

    print("Clipping EEG with those windows")
    summary = diagnose_and_clip(eeg_dir, out_dir, windows, save_full_if_no_overlap=SAVE_FULL_IF_NO_OVERLAP)

    # Tag with subject for later aggregation
    summary["subject"] = subj
    # Print a compact per subject table
    print("\nPer subject summary")
    print(summary[["subject", "eeg_file", "status", "rows_before", "rows_after", "pct_in_window"]]
          .sort_values(["subject", "eeg_file"])
          .to_string(index=False))

    all_summaries.append(summary)

# Combine all subjects and print a single table
combined = pd.concat(all_summaries, ignore_index=True)
combined_path = OUT_BASE / "all_subjects_clip_summary.csv"
combined.to_csv(combined_path, index=False, encoding="utf-8")

print("\n" + "#" * 70)
print("Combined summary for all subjects")
print("#" * 70)
print(combined[["subject", "eeg_file", "status", "rows_before", "rows_after", "pct_in_window"]]
      .sort_values(["subject", "eeg_file"])
      .to_string(index=False))

# Closing pointers to where the results were written.
print(f"\nSaved combined summary to {combined_path}")
print("\nOutputs per subject are under:")
for subj in SUBJECTS:
    print(f"  {OUT_BASE / subj}  [filtered clips, no overlap full copies, clip_summary.csv]")
