In [4]:
print('IT IS RUNNING!')
"""
What this script does (UPDATED):
1. Load MULTIPLE CSVs with patient events (data_0.csv ... data_4.csv), tag each row with its file of origin (source_file),
   and combine into one dataframe.  #CHANGES MADE#
2. For each patient:
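   - Find clusters: runs of at least MIN_CLUSTER_DAYS observation days in which consecutive days are at most
     DENSE_GAP_DAYS apart, and which start after a gap of at least SPARSE_GAP_DAYS (or on the patient's first day).
   - Build index days: observed days that lie outside every cluster and follow a gap of at least SPARSE_GAP_DAYS.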
   - Compute basic features (last value, days since last, MEDIAN, IQR, count) for 4 codes over lookback window.  #CHANGES MADE#
   - Label: cluster start between days (EXCLUSION_BEFORE_CLUSTER_DAYS+1)..PREDICTION_HORIZON_DAYS after index (default day 8..90).
3. Combine all indices into one dataset.
4. Train/evaluate Random Forest using 5-fold "leave-one-CSV-out" CV (train 5 times),
   BUT only print ONE final combined set of metrics using pooled out-of-fold predictions (no per-fold AUROC prints).  #CHANGES MADE#
"""

import argparse
from pathlib import Path
import sys
import math

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, classification_report
from sklearn.impute import SimpleImputer


# DEFAULT SETTINGS
# Input CSVs used when --csv is not given. Each file's name becomes the
# `source_file` tag, and each distinct tag is one cross-validation fold.
DEFAULT_CSV_PATHS = ["/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_0.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_1.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_2.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_3.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_4.csv"]

DEFAULT_TEST_SIZE = 0.2  # kept for compatibility (not used as a single split in this CV script)  #CHANGES MADE#
# Length (days) of the window before an index time over which median/IQR/count features are computed.
DEFAULT_LOOKBACK_DAYS = 90
# Minimum gap (days) that must precede both a cluster start and an index day.
DEFAULT_SPARSE_GAP_DAYS = 6
# Maximum gap (days) between consecutive observation days inside one cluster.
DEFAULT_DENSE_GAP_DAYS = 2
# Minimum number of observation days a dense run needs to count as a cluster.
DEFAULT_MIN_CLUSTER_DAYS = 4
# Value of the `table` column that clustering is restricted to; passing "none" on the CLI disables the filter.
DEFAULT_CLUSTER_FROM_TABLE = "measurement"
# Label window: a cluster must start between (EXCLUSION_BEFORE_CLUSTER_DAYS + 1) and
# PREDICTION_HORIZON_DAYS days after the index time for the index row to be positive.
EXCLUSION_BEFORE_CLUSTER_DAYS = 7
PREDICTION_HORIZON_DAYS = 90

# Measurement codes for which features are built.
CODE_SBP = "LOINC/8480-6"  # systolic BP
CODE_HR = "LOINC/8867-4"  # heart rate
CODE_GLUCOSE = "SNOMED/271649006"  # glucose levels
CODE_HBA1C = "SNOMED/271650006"  # HBA1c levels

# NOTE(review): not referenced anywhere in the visible code; presumably the
# expected input CSV schema. Confirm it is still needed, or validate against it.
EXPECTED_COLUMNS = [
    "subject_id", "time", "code", "numeric_value", "care_site_id", "clarity_table",
    "end", "note_id", "provider_id", "table", "text_value", "unit", "visit_id"
]


# FUNCTIONS

def parse_mixed_datetime(series):
    """Parse a Series of mixed-format datetime strings into datetimes.

    Values containing "/" are read strictly as ``DD/MM/YYYY HH:MM``; every
    other value goes through pandas' default parser with ``dayfirst=False``.
    Anything that cannot be parsed becomes NaT.
    """
    text = series.astype("string")
    has_slash = text.str.contains("/", na=False)

    slash_result = pd.to_datetime(
        text.where(has_slash), format="%d/%m/%Y %H:%M", errors="coerce"
    )
    default_result = pd.to_datetime(
        text.where(~has_slash), errors="coerce", dayfirst=False
    )

    # Each row is non-null in at most one of the two results, so filling
    # the gaps of one with the other merges them.
    return slash_result.fillna(default_result)


def load_csv_simple(path):
    """Read one events CSV and return it cleaned and ordered.

    The ``time`` column is parsed with ``parse_mixed_datetime`` and
    ``numeric_value`` is coerced to numbers (unparseable entries become NaN).
    Rows are then stably sorted by ``subject_id`` and ``time`` and re-indexed
    from zero.
    """
    print("Loading CSV from", path)
    frame = pd.read_csv(path, sep=",", encoding="utf-8-sig", low_memory=False)

    frame["time"] = parse_mixed_datetime(frame["time"])
    raw_numeric = frame.get("numeric_value")
    frame["numeric_value"] = pd.to_numeric(raw_numeric, errors="coerce")

    # mergesort is stable, so rows sharing subject and time keep their file order.
    ordered = frame.sort_values(["subject_id", "time"], kind="mergesort")
    return ordered.reset_index(drop=True)


#Identifies clusters of consecutive days with observations for a patient based on gap and minimum-length rules.
def find_clusters_for_patient(patient_df, sparse_gap_days, dense_gap_days, min_cluster_days, cluster_from_table):
    # Optionally restrict clustering to a specific source table; fall back if empty
    if cluster_from_table is not None and "table" in patient_df.columns:
        chosen = patient_df[(patient_df["table"] == cluster_from_table) & (~patient_df["time"].isna())].copy()
        if chosen.empty:
            chosen = patient_df.dropna(subset=["time"]).copy()
    else:
        # Use all non-null time observations
        chosen = patient_df.dropna(subset=["time"]).copy()

    # No valid timestamps → no clusters
    if chosen.empty:
        return []

    # Collapse timestamps to unique calendar days
    chosen["day"] = chosen["time"].dt.normalize()
    unique_days = sorted(chosen["day"].unique().tolist())

    clusters = []

    # Not enough days to form a cluster
    if len(unique_days) < min_cluster_days:
        return clusters

    # Compute gaps (in days) between consecutive observation days
    day_diffs = []
    for i in range(1, len(unique_days)):
        gap_days = (unique_days[i] - unique_days[i - 1]).days
        day_diffs.append(gap_days)

    # Scan for dense runs separated by gaps larger than dense_gap_days
    run_start_index = 0
    for i, gap in enumerate(day_diffs, start=1):
        if gap <= dense_gap_days:
            continue

        # End current run when gap exceeds dense threshold
        run_end_index = i - 1
        run_length = (run_end_index - run_start_index + 1)

        # Accept run only if long enough and preceded by a sufficiently sparse gap
        if run_length >= min_cluster_days:
            preceding_gap = math.inf if run_start_index == 0 else day_diffs[run_start_index - 1]
            if preceding_gap >= sparse_gap_days:
                start_day = unique_days[run_start_index]
                end_day = unique_days[run_end_index]
                clusters.append((start_day.normalize(), end_day.normalize(), run_length))

        run_start_index = i

    # Handle final run after loop
    run_end_index = len(unique_days) - 1
    run_length = (run_end_index - run_start_index + 1)
    if run_length >= min_cluster_days:
        preceding_gap = math.inf if run_start_index == 0 else day_diffs[run_start_index - 1]
        if preceding_gap >= sparse_gap_days:
            start_day = unique_days[run_start_index]
            end_day = unique_days[run_end_index]
            clusters.append((start_day.normalize(), end_day.normalize(), run_length))

    # Return list of (cluster_start_day, cluster_end_day, number_of_days)
    return clusters


def build_index_days(patient_df, clusters, sparse_gap_days):
    """Select the "index" days for one patient.

    An index day is an observed calendar day that is not covered by any
    cluster (cluster spans are inclusive) and whose gap to the previous
    observed day is at least ``sparse_gap_days``. The first observed day has
    no predecessor and therefore always passes the gap test.

    Args:
        patient_df: Rows for a single patient with ``subject_id`` and ``time``
            columns, and optionally ``source_file``.
        clusters: Iterable of ``(start_day, end_day, n_days)`` tuples.
        sparse_gap_days: Minimum quiet gap (days) required before an index day.

    Returns:
        A chronologically ordered list of
        ``(subject_id, index_time, index_day, source_file)`` tuples.
        ``index_time`` is the earliest timestamp on the index day and
        ``source_file`` is taken from the first row carrying that timestamp
        ("unknown" when the column is absent).
    """
    # Only these columns are needed; avoids copying the whole patient frame.
    needed = [c for c in ("time", "source_file") if c in patient_df.columns]
    df = patient_df[needed].dropna(subset=["time"]).copy()
    if df.empty:
        return []
    df["time"] = pd.to_datetime(df["time"], errors="coerce")
    df = df.dropna(subset=["time"])

    # Work at day granularity
    df["day"] = df["time"].dt.normalize()

    # Every calendar day covered by any cluster (inclusive on both ends)
    cluster_day_set = {
        pd.Timestamp(d).normalize()
        for start_day, end_day, _ in clusters
        for d in pd.date_range(start_day, end_day, freq="D")
    }

    # One row per observed day holding that day's earliest timestamp. The
    # stable sort keeps the original row order among equal timestamps, so the
    # surviving row is the first one recorded at that earliest time.
    first_rows = (
        df.sort_values("time", kind="mergesort")
          .drop_duplicates(subset="day", keep="first")
    )
    if "source_file" in first_rows.columns:
        sources = first_rows["source_file"]
    else:
        sources = ["unknown"] * len(first_rows)

    sid = patient_df["subject_id"].iloc[0]
    index_list = []
    prev_day = None
    for current_day, first_time, src_file in zip(first_rows["day"], first_rows["time"], sources):
        gap_days = math.inf if prev_day is None else (current_day - prev_day).days
        if current_day not in cluster_day_set and gap_days >= sparse_gap_days:
            index_list.append((sid, first_time, current_day, src_file))
        # The gap is always measured to the previous observed day, index day or not.
        prev_day = current_day

    return index_list


def build_features_for_index(patient_df, index_time, lookback_days):
    """Build the feature dict for one index time from a patient's measurements.

    For each of the four tracked codes (SBP, heart rate, glucose, HbA1c) five
    features are produced, keyed by the code with "/" replaced by "_":

    * ``_last``: value of the most recent record at or before ``index_time``
    * ``_days_since_last``: whole days between that record and ``index_time``
    * ``_median_{lookback_days}d`` / ``_iqr_{lookback_days}d`` /
      ``_count_{lookback_days}d``: statistics of the non-null values recorded
      in ``[index_time - lookback_days, index_time]``

    Missing information yields NaN (count is 0). The IQR is only computed when
    the window holds at least four values.
    """
    # Use only rows from the measurement table when the column exists.
    measurements = patient_df
    if "table" in measurements.columns:
        measurements = measurements[measurements["table"] == "measurement"]
    measurements = measurements[measurements["time"].notna()]

    window_start = index_time - pd.Timedelta(days=lookback_days)
    features = {}

    for code in (CODE_SBP, CODE_HR, CODE_GLUCOSE, CODE_HBA1C):
        prefix = code.replace("/", "_")
        per_code = measurements[measurements["code"] == code]
        up_to_index = per_code[per_code["time"] <= index_time]

        # Most recent record at or before the index time
        if up_to_index.empty:
            last_val, days_since = np.nan, np.nan
        else:
            latest = up_to_index.sort_values("time").iloc[-1]
            last_val = latest["numeric_value"]
            days_since = (index_time - latest["time"]).days

        # Non-null values inside the lookback window (upper bound already applied)
        in_window = up_to_index.loc[up_to_index["time"] >= window_start, "numeric_value"].dropna()
        n_obs = int(len(in_window))
        median = float(in_window.median()) if n_obs else np.nan
        if n_obs >= 4:
            # Quartiles from fewer than four points are not considered meaningful here.
            iqr = float(in_window.quantile(0.75)) - float(in_window.quantile(0.25))
        else:
            iqr = np.nan

        # Insertion order fixes the feature column order downstream.
        features[f"{prefix}_last"] = last_val
        features[f"{prefix}_days_since_last"] = days_since
        features[f"{prefix}_median_{lookback_days}d"] = median
        features[f"{prefix}_iqr_{lookback_days}d"] = iqr
        features[f"{prefix}_count_{lookback_days}d"] = n_obs

    return features


def label_index_row(index_time, clusters):
    """Return 1 if any cluster starts inside the prediction window, else 0.

    The window runs from the day ``EXCLUSION_BEFORE_CLUSTER_DAYS + 1`` days
    after ``index_time`` to the day ``PREDICTION_HORIZON_DAYS`` days after it,
    both ends inclusive and normalised to midnight. ``clusters`` is a sequence
    of ``(start_day, end_day, n_days)`` tuples; an empty sequence gives 0.
    """
    if not clusters:
        return 0

    window_open = (index_time + pd.Timedelta(days=EXCLUSION_BEFORE_CLUSTER_DAYS + 1)).normalize()
    window_close = (index_time + pd.Timedelta(days=PREDICTION_HORIZON_DAYS)).normalize()

    starts_in_window = any(
        window_open <= cluster_start <= window_close
        for cluster_start, _cluster_end, _n_days in clusters
    )
    return 1 if starts_in_window else 0


def main():
    """Run the whole pipeline: load CSVs, build the index dataset, cross-validate.

    Steps:
      1. Parse command-line options (unknown options are reported and ignored).
      2. Load every CSV, tag its rows with the file name (``source_file``) and
         combine them.
      3. Per patient: find clusters, derive index days, build features and the
         binary label for each index day.
      4. Leave-one-CSV-out cross-validation with a random forest; predictions
         of all held-out files are pooled and one combined report is printed.

    Returns early with a message when no index rows are produced, when fewer
    than two source files are present, or when no fold yields predictions.
    """
    parser = argparse.ArgumentParser(description="Beginner temporal hospitalization model (multi-CSV)")
    # Hidden option; presumably absorbs the `-f <connection file>` argument that
    # Jupyter passes to the kernel. TODO confirm.
    parser.add_argument("-f", default=None, help=argparse.SUPPRESS)
    parser.add_argument("--csv", nargs="+", default=DEFAULT_CSV_PATHS,
                        help="One or more CSV file paths.")
    parser.add_argument("--test-size", type=float, default=DEFAULT_TEST_SIZE,
                        help="Unused; kept for backward compatibility.")
    parser.add_argument("--lookback-days", type=int, default=DEFAULT_LOOKBACK_DAYS)
    parser.add_argument("--sparse-gap-days", type=int, default=DEFAULT_SPARSE_GAP_DAYS)
    parser.add_argument("--dense-gap-days", type=int, default=DEFAULT_DENSE_GAP_DAYS)
    parser.add_argument("--min-cluster-days", type=int, default=DEFAULT_MIN_CLUSTER_DAYS)
    parser.add_argument("--cluster-from-table", default=DEFAULT_CLUSTER_FROM_TABLE)
    args, unknown = parser.parse_known_args()
    if unknown:
        print(f"Ignoring unknown args: {unknown}")

    # The literal string "none" (any case) switches the table filter off.
    cluster_from_table = None if str(args.cluster_from_table).lower() == "none" else args.cluster_from_table

    # Load all CSVs, tag each row with its file of origin, then combine.
    dfs = []
    for path in args.csv:
        p = Path(path).expanduser()
        part = load_csv_simple(str(p))
        part["source_file"] = p.name
        dfs.append(part)

    df = (pd.concat(dfs, ignore_index=True)
            .sort_values(["subject_id", "time"])
            .reset_index(drop=True))
    print(f"Combined rows: {len(df)}  Files loaded: {len(dfs)}")

    subjects = df["subject_id"].dropna().unique()
    print(f"Unique subjects: {len(subjects)}")

    # One groupby pass instead of filtering the full frame once per subject.
    rows = []
    for _, patient in df.groupby("subject_id", sort=False):
        clusters = find_clusters_for_patient(
            patient,
            sparse_gap_days=args.sparse_gap_days,
            dense_gap_days=args.dense_gap_days,
            min_cluster_days=args.min_cluster_days,
            cluster_from_table=cluster_from_table
        )
        index_days = build_index_days(patient, clusters, sparse_gap_days=args.sparse_gap_days)

        for sid, idx_time, idx_day, src_file in index_days:
            row = {
                "subject_id": sid,
                "index_time": idx_time,
                "index_day": idx_day,
                "label": label_index_row(idx_time, clusters),
                "source_file": src_file
            }
            row.update(build_features_for_index(patient, idx_time, lookback_days=args.lookback_days))
            rows.append(row)

    # An empty row list would give a column-less DataFrame and make the
    # sort below fail with a KeyError, so stop here with a clear message.
    if not rows:
        print("No index days were generated; nothing to train or evaluate.")
        return

    data = (pd.DataFrame(rows)
              .sort_values(["subject_id", "index_time"])
              .reset_index(drop=True))

    print(f"Dataset size: {len(data)}  Positives: {data['label'].sum()}  Negatives: {len(data)-data['label'].sum()}")

    # Everything that is not metadata is a model feature.
    meta_cols = ["subject_id", "index_time", "index_day", "label", "source_file"]
    feature_cols = [c for c in data.columns if c not in meta_cols]

    # Distinct CSV sources are the cross-validation folds.
    # NOTE(review): folds are defined by the source_file of each index row; if a
    # subject's rows span several CSVs, that subject can end up in both train
    # and test. Confirm subjects are unique to one file.
    files = sorted(data["source_file"].dropna().unique().tolist())
    if len(files) < 2:
        print("Not enough distinct source_file values to run leave-one-CSV-out CV.")
        return

    # Out-of-fold labels and probabilities, one array per evaluated fold
    y_true_all = []
    y_prob_all = []
    skipped_folds = []

    for test_file in files:
        # Hold out one CSV, train on the rest
        test_df = data[data["source_file"] == test_file]
        train_df = data[data["source_file"] != test_file]

        if test_df.empty or train_df.empty:
            skipped_folds.append(test_file)
            continue

        X_train = train_df[feature_cols].to_numpy(dtype=float)
        y_train = train_df["label"].astype(int).to_numpy()
        X_test = test_df[feature_cols].to_numpy(dtype=float)
        y_test = test_df["label"].astype(int).to_numpy()

        # A classifier cannot be fitted on a single class.
        if len(np.unique(y_train)) < 2:
            skipped_folds.append(test_file)
            continue

        # Trees need no scaling, but NaNs must be filled; the imputer is
        # fitted on the training fold only to avoid leaking test statistics.
        imputer = SimpleImputer(strategy="median")
        X_train_imp = imputer.fit_transform(X_train)
        X_test_imp = imputer.transform(X_test)

        # Random forest with per-bootstrap class re-weighting
        model = RandomForestClassifier(
            n_estimators=600,
            max_features="sqrt",
            min_samples_leaf=5,
            class_weight="balanced_subsample",
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train_imp, y_train)

        # Probability of the positive class for the held-out CSV
        y_prob = model.predict_proba(X_test_imp)[:, 1]

        y_true_all.append(y_test)
        y_prob_all.append(y_prob)

    # Make skipped folds visible: their rows are missing from the pooled report.
    if skipped_folds:
        print(f"Skipped folds (empty split or single-class training data): {skipped_folds}")

    if not y_true_all:
        print("No CV folds produced predictions (check class balance per fold / source_file assignment).")
        return

    # Pool all out-of-fold predictions and threshold at 0.5 for the class report
    y_true_all = np.concatenate(y_true_all)
    y_prob_all = np.concatenate(y_prob_all)
    y_pred_all = (y_prob_all >= 0.5).astype(int)

    # One combined evaluation across all folds
    print(f"Test rows: {len(y_true_all)}  Positives: {int(y_true_all.sum())}  Negatives: {int(len(y_true_all)-y_true_all.sum())}")

    # Ranking metrics are undefined when only one class is present.
    if len(np.unique(y_true_all)) > 1:
        print("AUROC:", round(roc_auc_score(y_true_all, y_prob_all), 3))
        print("AUPRC:", round(average_precision_score(y_true_all, y_prob_all), 3))
    print("Brier:", round(brier_score_loss(y_true_all, y_prob_all), 4))
    print(classification_report(y_true_all, y_pred_all, digits=3))


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

IT IS RUNNING!
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_0.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_1.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_2.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_3.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_4.csv
Combined rows: 10502789  Files loaded: 5
Unique subjects: 946
Dataset size: 62171  Positives: 4058  Negatives: 58113
Test rows: 62171  Positives: 4058  Negatives: 58113
AUROC: 0.562
AUPRC: 0.078
Brier: 0.0999
              precision    recall  f1-score   support

           0      0.935     0.995     0.964     58113
  