In [8]:
"""
What this script does:
1. Load a CSV with patient events.
2. For each patient:
   - Find clusters of activity days where several reading taken in conjunction with eachother. 
   - Build index days outside clusters after a sparse gap.
   - Compute basic features (last value, days since last, mean, count) for 4 codes over lookback window.
   - Label: cluster start between days 8..180 after index.
3. Combine all indices, temporal split, train Logistic Regression, evaluate.
"""

import argparse
from pathlib import Path
import sys
import math

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# DEFAULT SETTINGS
DEFAULT_CSV_PATHS = ["/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_0.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_1.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_2.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_3.csv",
                     "/Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_4.csv"]

DEFAULT_TEST_SIZE = 0.2 #Fraction of rows held out for testing. If 0.2 - means 20% data used for testing. 
DEFAULT_LOOKBACK_DAYS = 90 #for each index day, look back X days to see what the patient’s measurements were.  
DEFAULT_SPARSE_GAP_DAYS = 6 #only mark a day as a new index if there has been at least a X‑day quiet spell 
DEFAULT_DENSE_GAP_DAYS = 2 #when finding clusters, if days <X days apart, they are part of the same cluster 
DEFAULT_MIN_CLUSTER_DAYS = 4 #A run of active days must last at least X distinct days to count as a cluster (possible hospitalisation).
DEFAULT_CLUSTER_FROM_TABLE = "measurement" #dont want to include non-measurements here 
EXCLUSION_BEFORE_CLUSTER_DAYS = 7 #prevents us including patients already hospitalised - model anticipates, not detects 
PREDICTION_HORIZON_DAYS = 90 #index = positive if a cluster starts EXCLUSION_BEFORE_CLUSTER_DAYS < cluster <PREDICTION_HORIZON_DAYS

CODE_SBP = "LOINC/8480-6" #systolic BP
CODE_HR = "LOINC/8867-4" #heart rate 
CODE_GLUCOSE = "SNOMED/271649006" #glucose levels 
CODE_HBA1C = "SNOMED/271650006" #HBA1c levels 

EXPECTED_COLUMNS = [  
    "subject_id","time","code","numeric_value","care_site_id","clarity_table",
    "end","note_id","provider_id","table","text_value","unit","visit_id"]  #set columns for the INSPECT dataset. 

#FUNCTIONS

#function to ensure consistent datetime values across mixed date formats. 
def parse_mixed_datetime(series): 
    series = series.astype("string")
    mask_slash = series.str.contains("/", na=False)
    parsed_slash = pd.to_datetime(series.where(mask_slash), format="%d/%m/%Y %H:%M", errors="coerce")
    parsed_other = pd.to_datetime(series.where(~mask_slash), errors="coerce", dayfirst=False)
    return parsed_slash.fillna(parsed_other)

#check the CSV is in correct format and clean up any messy data. Used AI to ensure ran properly 
def load_csv_simple(path):
    print("Loading CSV from", path)
    df = pd.read_csv(path, sep=",", encoding="utf-8-sig", low_memory=False) #used AI here to encoding="utf-8-sig", low_memory=False
    df["time"] = parse_mixed_datetime(df["time"])
    df["numeric_value"] = pd.to_numeric(df.get("numeric_value"), errors="coerce")
    df = df.sort_values(["subject_id","time"], kind="mergesort").reset_index(drop=True)
    return df

#used to look for clusters - I used AI to make sure the programme ran correctly i struggling with this. 
def find_clusters_for_patient(patient_df, sparse_gap_days, dense_gap_days, min_cluster_days, cluster_from_table):
    if cluster_from_table is not None and "table" in patient_df.columns: #run detection only on measurement rows 
        chosen = patient_df[(patient_df["table"] == cluster_from_table) & (~patient_df["time"].isna())].copy() #Keeps only relevant activity days and drops rows without timestamps 
        if chosen.empty: 
            chosen = patient_df.dropna(subset=["time"]).copy() #no rows of the requested table for this patient, use all timed rows so the patient isn’t skipped entirely.
    else:
        chosen = patient_df.dropna(subset=["time"]).copy() #Use all rows (with valid time) to detect clusters 
    if chosen.empty:
        return [] #early exit if patient has no time stamps. 
    chosen["day"] = chosen["time"].dt.normalize() #multiple events on the same calendar day count as a single “active day” in cluster detection.
    unique_days = sorted(chosen["day"].unique().tolist()) #Collect sorted unique active days
    clusters = [] #storage for clusters 
    if len(unique_days) < min_cluster_days: #Early exit if not enough days to form even one cluster
        return clusters
    day_diffs = []
    for i in range(1, len(unique_days)):
        gap_days = (unique_days[i] - unique_days[i-1]).days
        day_diffs.append(gap_days)
    run_start_index = 0
    for i, gap in enumerate(day_diffs, start=1):
        if gap <= dense_gap_days:
            continue
        run_end_index = i - 1
        run_length = (run_end_index - run_start_index + 1)
        if run_length >= min_cluster_days:
            preceding_gap = math.inf if run_start_index == 0 else day_diffs[run_start_index - 1]
            if preceding_gap >= sparse_gap_days:
                start_day = unique_days[run_start_index]
                end_day = unique_days[run_end_index]
                clusters.append((start_day.normalize(), end_day.normalize(), run_length))
        run_start_index = i
    run_end_index = len(unique_days) - 1
    run_length = (run_end_index - run_start_index + 1)
    if run_length >= min_cluster_days:
        preceding_gap = math.inf if run_start_index == 0 else day_diffs[run_start_index - 1]
        if preceding_gap >= sparse_gap_days:
            start_day = unique_days[run_start_index]
            end_day = unique_days[run_end_index]
            clusters.append((start_day.normalize(), end_day.normalize(), run_length))
    return clusters

#“index” timestamps for a patient: looking for days that are outside any cluster and occur after a sufficiently long quiet gap
def build_index_days(patient_df, clusters, sparse_gap_days):
    df = patient_df.dropna(subset=["time"]).copy()
    if df.empty:
        return []
    df["time"] = pd.to_datetime(df["time"], errors="coerce")
    df = df.dropna(subset=["time"])
    df["day"] = df["time"].dt.normalize()

    cluster_day_set = {
        pd.Timestamp(d).normalize()
        for start_day, end_day, _ in clusters
        for d in pd.date_range(start_day, end_day, freq="D")}

    day_rows = df[["day"]].drop_duplicates().sort_values("day").reset_index(drop=True)
    index_list, prev_day = [], None
    sid = patient_df["subject_id"].iloc[0]  # keep type as-is

    for i in range(len(day_rows)):
        current_day = day_rows.loc[i, "day"]
        in_cluster = current_day in cluster_day_set
        gap_days = math.inf if prev_day is None else (current_day - prev_day).days
        if (not in_cluster) and gap_days >= sparse_gap_days:
            first_time = df.loc[df["day"] == current_day, "time"].min()
            index_list.append((sid, first_time, current_day))
        prev_day = current_day

    return index_list


#for each lab (BP/HR/HBA1C/GLUCOSE) finds :
#1.The most recent value before the prediction time
#2.How long ago that measurement happened
#3.The average and count of values in the past N days. 
#adds to result dict 

def build_features_for_index(patient_df, index_time, lookback_days):
    result = {}
    meas_df = patient_df.copy() #producd new df to avoid overwriting
    if "table" in meas_df.columns:
        meas_df = meas_df[meas_df["table"] == "measurement"].copy() #filter to measurements - ignore the rest 
    lookback_start = index_time - pd.Timedelta(days=lookback_days) #calc earliest timestamp included in the lookback window
    codes = [CODE_SBP, CODE_HR, CODE_GLUCOSE, CODE_HBA1C]
    for code_val in codes:
        col_prefix = code_val.replace("/", "_") #standardise codes 
        code_df = meas_df[(meas_df["code"] == code_val) & (~meas_df["time"].isna())].copy() #Filter to rows for the current code with a non-null timestamp
        last_df = code_df[code_df["time"] <= index_time].sort_values("time") #Keep only rows occurring at or before the index time (no future leakage) and sort chronologically.
        if last_df.empty: # if no last_val/last_days_since, set to None. 
            last_val = np.nan
            last_days_since = np.nan
        else:             
            last_row = last_df.iloc[-1] #take last row after sort and 
            last_val = last_row["numeric_value"] #store its value 
            last_days_since = (index_time - last_row["time"]).days #Compute how many full days ago it occurred relative to index_time
        window_df = code_df[(code_df["time"] >= lookback_start) & (code_df["time"] <= index_time)]
        window_vals = window_df["numeric_value"].dropna()
        if window_vals.empty:
            window_mean = np.nan
            window_count = 0
        else:
            window_mean = float(window_vals.mean())
            window_count = int(len(window_vals))
        result[f"{col_prefix}_last"] = last_val
        result[f"{col_prefix}_days_since_last"] = last_days_since
        result[f"{col_prefix}_mean_{lookback_days}d"] = window_mean
        result[f"{col_prefix}_count_{lookback_days}d"] = window_count
    return result


#Will a cluster start between horizon_start and horizon_end?”
#1 is pos (cluster starts)
#0 is neg (cluster doesnt start)
def label_index_row(index_time, clusters): 
    if not clusters: 
        return 0
    horizon_start = (index_time + pd.Timedelta(days=EXCLUSION_BEFORE_CLUSTER_DAYS + 1)).normalize()
    horizon_end = (index_time + pd.Timedelta(days=PREDICTION_HORIZON_DAYS)).normalize()
    for (start_time,end_time,n) in clusters:
        if (start_time >= horizon_start) and (start_time <= horizon_end):
            return 1
    return 0

'''USED AI FOR THIS:::
Split a dataset of index rows into a training and testing set based on chronological order (not random). 
The most recent fraction (defined by test_size) becomes the test set; earlier rows become the training set. 
It returns two NumPy arrays of row indices: train_idx and test_idx. 
'''
def temporal_split(dataset_df, test_size):
    df_sorted = dataset_df.sort_values("index_time").reset_index(drop=True)
    if df_sorted.empty:
        return np.array([],dtype=int), np.array([],dtype=int)
    cutoff = df_sorted["index_time"].quantile(1 - test_size)
    train_idx = df_sorted.index[df_sorted["index_time"] < cutoff].to_numpy()
    test_idx = df_sorted.index[df_sorted["index_time"] >= cutoff].to_numpy()
    if len(train_idx) == 0 and len(df_sorted) >= 2:
        train_idx = np.arange(0, len(df_sorted)-1)
        test_idx = np.arange(len(df_sorted)-1, len(df_sorted))
    if len(test_idx) == 0 and len(df_sorted) >= 2:
        split_pt = int(np.floor((1 - test_size)*len(df_sorted)))
        train_idx = np.arange(0, max(split_pt,1))
        test_idx = np.arange(max(split_pt,1), len(df_sorted))
    return train_idx, test_idx


#loading data in using argparse. 
def main():
    parser = argparse.ArgumentParser(description="Beginner temporal hospitalization model (multi-CSV)")
    parser.add_argument("-f", default=None, help=argparse.SUPPRESS)
    parser.add_argument("--csv", nargs="+", default=DEFAULT_CSV_PATHS,
                        help="One or more CSV file paths.")
    parser.add_argument("--test-size", type=float, default=DEFAULT_TEST_SIZE)
    parser.add_argument("--lookback-days", type=int, default=DEFAULT_LOOKBACK_DAYS)
    parser.add_argument("--sparse-gap-days", type=int, default=DEFAULT_SPARSE_GAP_DAYS)
    parser.add_argument("--dense-gap-days", type=int, default=DEFAULT_DENSE_GAP_DAYS)
    parser.add_argument("--min-cluster-days", type=int, default=DEFAULT_MIN_CLUSTER_DAYS)
    parser.add_argument("--cluster-from-table", default=DEFAULT_CLUSTER_FROM_TABLE)
    parser.add_argument("--out-features", default=None)
    parser.add_argument("--out-preds", default=None)
    args, unknown = parser.parse_known_args()
    if unknown:
        print(f"Ignoring unknown args: {unknown}")

    cluster_from_table = None if str(args.cluster_from_table).lower() == "none" else args.cluster_from_table
    
#combine the 5 CSVs into a single CSV
    dfs = []
    for path in args.csv:
        p = Path(path).expanduser()
        part = load_csv_simple(str(p))
        dfs.append(part)
    
    df = (pd.concat(dfs, ignore_index=True)
            .sort_values(["subject_id", "time"])
            .reset_index(drop=True))
    print(f"Combined rows: {len(df)}  Files loaded: {len(dfs)}")
    
    subjects = df["subject_id"].dropna().unique()
    print(f"Unique subjects: {len(subjects)}") 
    
    rows = []
    for sid in subjects: #make a dffor each unique patient 
        patient = df[df["subject_id"] == sid]
        clusters = find_clusters_for_patient( #use previous cluster function to find patient clusters 
            patient,
            sparse_gap_days=args.sparse_gap_days,
            dense_gap_days=args.dense_gap_days,
            min_cluster_days=args.min_cluster_days,
            cluster_from_table=cluster_from_table)
        index_days = build_index_days(patient, clusters, sparse_gap_days=args.sparse_gap_days)#use previous function to look for days that are outside any cluster
        for sid2, idx_time, idx_day in index_days: #for patients index days 
            feats = build_features_for_index(patient, idx_time, lookback_days=args.lookback_days) #use previous function to find code values in and out of cluster 
            label = label_index_row(idx_time, clusters) #looks to see whether a cluster starts or doesnt start
            row = {"subject_id": sid2, "index_time": idx_time, "index_day": idx_day, "label": label}
            row.update(feats) #Merges the feature dictionary into the row dictionary
            rows.append(row)

    data = (pd.DataFrame(rows) #Converts the accumulated list of dicts above into a pandas DataFrame
              .sort_values(["subject_id", "index_time"])
              .reset_index(drop=True))
    print(f"Dataset size: {len(data)}  Positives: {data['label'].sum()}  Negatives: {len(data)-data['label'].sum()}")

    
    train_idx, test_idx = temporal_split(data, args.test_size) #split the dataset (data we made above) into training and test indices and set parameters for training 
    meta_cols = ["subject_id", "index_time", "index_day", "label"]
    feature_cols = [c for c in data.columns if c not in meta_cols]
    X = data[feature_cols].to_numpy(dtype=float) #extract features into numpy 
    y = data["label"].astype(int).to_numpy() #target labels (1 for cluster, 0 for not) into numpy 
    X_train, y_train = X[train_idx], y[train_idx] #feature and label selected training
    X_test, y_test = X[test_idx], y[test_idx]#feature and label selected testing

    if len(np.unique(y_train)) < 2: #make sure enough features to train 
        print("One class in training set.")
        return

    #missing training and test data training medians
    imputer = SimpleImputer(strategy="median") 
    scaler = StandardScaler() 
    X_train_imp = imputer.fit_transform(X_train)
    X_test_imp = imputer.transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train_imp)
    X_test_scaled = scaler.transform(X_test_imp)

    #model this 
    model = LogisticRegression(class_weight="balanced", max_iter=2000)
    model.fit(X_train_scaled, y_train)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    y_pred = (y_prob >= 0.5).astype(int)
    #print our values out. 
    print(f"Test rows: {len(y_test)}  Positives: {y_test.sum()}  Negatives: {len(y_test)-y_test.sum()}")
    if len(np.unique(y_test)) > 1:
        try: print("AUROC:", round(roc_auc_score(y_test, y_prob), 3))
        except: pass
        try: print("AUPRC:", round(average_precision_score(y_test, y_prob), 3))
        except: pass
        try: print("Brier:", round(brier_score_loss(y_test, y_prob), 4))
        except: pass
    print(classification_report(y_test, y_pred, digits=3))

In [10]:
main()

Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_0.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_1.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_2.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_3.csv
Loading CSV from /Users/colleenohare/Desktop/Bioinformatics_MSC/RETFOUND/Chris_Sainsbury/Inspect_Dataset/inspect_data_csv/data_4.csv
Combined rows: 10502789  Files loaded: 5
Unique subjects: 946
Dataset size: 62171  Positives: 4058  Negatives: 58113
Test rows: 12441  Positives: 739  Negatives: 11702
AUROC: 0.632
AUPRC: 0.127
Brier: 0.2315
              precision    recall  f1-score   support

           0      0.957     0.731     0.829     11702
           1      