# Build giant tables for multi-dataset patients

In [1]:
import os
import numpy as np
import pandas as pd

import scipy.io
from scipy.stats import iqr
from scipy import interpolate

In [2]:
# Read the implant-date spreadsheet into patients_df and display it
implant_dates_path = "../../Data/HUP_implant_dates.xlsx"
patients_df = pd.read_excel(implant_dates_path)
patients_df

Unnamed: 0,hup_id,IEEG_Portal_Number,Implant_Date,implant_time,Explant_Date,weight_kg
0,225,HUP225_phaseII,2021-10-18,07:15:00,2021-10-26 17:30:00,58.5
1,224,HUP224_phaseII,2021-10-13,07:15:00,2021-10-20 00:00:00,85.5
2,223,HUP223_phaseII,2021-09-29,07:15:00,2021-10-08 08:21:00,101.4
3,221,HUP221_phaseII,2021-08-16,07:15:00,2021-08-23 00:00:00,124.3
4,219,HUP219_phaseII,2021-07-12,07:15:00,2021-07-16 08:18:00,101.6
...,...,...,...,...,...,...
75,141,HUP141_phaseII,2017-05-24,07:15:00,2017-06-01 00:00:00,85.7
76,140,HUP140_phaseII_D01-D02,2017-05-10,07:15:00,2017-05-19 00:00:00,56.7
77,139,HUP139_phaseII,2017-04-26,07:15:00,2017-05-09 00:00:00,69.8
78,138,HUP138_phaseII,2017-04-12,07:15:00,2017-04-20 00:00:00,84.4


In [3]:
# Map each hup_id to the positional index of its row in patients_df
# (computed on the table as loaded, before any filtering in later cells)
patient_hup_id_to_index = {
    hup_id: row_position
    for row_position, hup_id in enumerate(patients_df["hup_id"])
}

In [4]:
def get_patient_hup_ids(directory):
    """Return the integer patient ids encoded in the ``HUP_<id>*.npy`` files of a directory.

    Parameters
    ----------
    directory : str
        Folder to scan (non-recursive).

    Returns
    -------
    list of int
        One id per matching file, in ``os.listdir`` order. Ids are not
        de-duplicated or sorted here.

    Raises
    ------
    ValueError
        If a matching file name does not carry an integer between the first
        underscore and the following ``.`` or ``_``.
    """
    patient_hup_ids = []
    for filename in os.listdir(directory):
        # Skip anything that is not a HUP_*.npy file
        if not (filename.startswith("HUP_") and filename.endswith(".npy")):
            continue
        # "HUP_<id>.npy" -> "<id>.npy" -> "<id>"
        id_token = filename.split("_")[1]
        patient_hup_ids.append(int(id_token.split(".")[0]))

    return patient_hup_ids


# Folder scanned for HUP_<id>.npy files; the per-patient loop later loads synchrony from the same path
directory = "../../Data/synchrony/all/broadband_multi_dataset_combined/"
# Unique patient ids found there, in ascending order
completed_hup_ids = sorted(set(get_patient_hup_ids(directory)))
print(completed_hup_ids)
len(completed_hup_ids)

[137, 140, 147, 148, 149, 152, 153, 156, 159, 167, 168, 179, 181, 193, 194, 195, 197, 201, 208, 209, 213, 214, 215, 216]


24

In [5]:
# Keep only the patients listed in completed_hup_ids and renumber the rows from 0
patients_df = patients_df[patients_df["hup_id"].isin(completed_hup_ids)].reset_index(
    drop=True
)
# num_datasets is the integer at the very end of IEEG_Portal_Number
# (e.g. "HUP215_phaseII_D01-04" -> 4). The whole trailing digit run is parsed
# rather than only the last character, so a range ending in "D10" yields 10, not 0.
# A portal number without trailing digits raises on the int conversion.
patients_df["num_datasets"] = (
    patients_df["IEEG_Portal_Number"]
    .str.extract(r"(\d+)$", expand=False)
    .astype(int)
)
patients_df

Unnamed: 0,hup_id,IEEG_Portal_Number,Implant_Date,implant_time,Explant_Date,weight_kg,num_datasets
0,216,HUP216_phaseII_D01-D02,2021-04-05,07:15:00,2021-04-20 00:00:00,77.1,2
1,215,HUP215_phaseII_D01-04,2021-01-25,07:15:00,2021-02-05 00:00:00,90.3,4
2,214,HUP214_phaseII_D01-D02,2021-01-13,07:15:00,2021-01-27 00:00:00,59.2,2
3,213,HUP213_phaseII_D01-02,2021-01-04,07:15:00,2021-01-29 00:00:00,87.5,2
4,209,HUP209_phaseII_D01-D02,2020-11-09,07:15:00,2020-11-25 17:44:00,70.3,2
5,208,HUP208_phaseII_D01-D02,2020-10-26,07:15:00,2020-11-07 00:00:00,85.4,2
6,201,HUP201_phaseII_D01-D02,2020-02-26,07:15:00,2020-03-11 00:00:00,97.1,2
7,197,HUP197_phaseII_D01-02,2019-10-28,07:15:00,2019-11-15 00:00:00,88.8,2
8,195,HUP195_phaseII_D01-D03,2019-07-31,07:15:00,2019-08-12 17:05:00,63.5,3
9,194,HUP194_phaseII_D01-D03,2019-09-25,07:15:00,2019-10-07 00:00:00,72.6,3


In [6]:
# Load the iEEG offset spreadsheet, keep the rows whose hup_id is in
# completed_hup_ids, and renumber the remaining rows from 0
ieeg_offset_df = (
    pd.read_excel("../../Data/ieeg_offset_new.xlsx")
    .loc[lambda offsets: offsets["hup_id"].isin(completed_hup_ids)]
    .reset_index(drop=True)
)
ieeg_offset_df

Unnamed: 0,hup_id,ieeg_offset_1,ieeg_offset_2,ieeg_offset_3,ieeg_offset_4
0,216,111832,715546.0,,
1,215,140348,192645.0,744987.0,
2,214,39906,369961.0,,
3,213,127767,716056.0,,
4,209,66189,713741.0,,
5,208,134301,540024.0,,
6,201,125353,629129.0,,
7,197,127860,457202.0,,
8,195,133216,492600.0,557580.0,
9,194,125874,199500.0,681720.0,


In [7]:
# Collect the name of every medication that appears in any patient's medication file
all_med_names = []

for patient_hup_id in patients_df["hup_id"]:
    # NOTE: allow_pickle=True can execute arbitrary code stored in the file;
    # only load medication files from a trusted source.
    aed_np_file = np.load(
        f"../../Data/medications/HUP_{patient_hup_id}.npy", allow_pickle=True
    )

    # Entry 2 of the file holds the medication names; entries 0 and 1 are not needed here
    all_med_names.extend(aed_np_file[2])

# Sorted array of distinct names, as strings
all_med_names = np.unique(np.array(all_med_names, dtype=str))
all_med_names

array(['brivaracetam', 'carbamazepine', 'clobazam', 'clonazepam',
       'clorazepate', 'eslicarbazepine', 'felbamate', 'lacosamide',
       'lamotrigine', 'levetiracetam', 'lorazepam', 'oxcarbazepine',
       'phenytoin', 'pregabalin', 'topiramate', 'zonisamide'],
      dtype='<U15')

In [8]:
# Load aed_ref_ranges.xlsx from ../../Data/
aed_ref_ranges_df = pd.read_excel("../../Data/aed_ref_ranges.xlsx")
# Drug names are lower-cased so they can be compared with the medication names
aed_ref_ranges_df["Drug"] = aed_ref_ranges_df["Drug"].str.lower()
# Show which units occur in the sheet
print(aed_ref_ranges_df["Unit"].unique())
# mg/L and ug/mL are numerically identical; only ng/mL rows need rescaling (/1000 -> ug/mL).
# NOTE(review): the Unit column itself is left as "ng/mL" after the conversion.
is_ng_per_ml = aed_ref_ranges_df["Unit"] == "ng/mL"
for bound in ("Min", "Max"):
    aed_ref_ranges_df.loc[is_ng_per_ml, bound] = (
        aed_ref_ranges_df.loc[is_ng_per_ml, bound] / 1000
    )
# Midpoint of the reference range
aed_ref_ranges_df["Avg"] = aed_ref_ranges_df["Min"].add(aed_ref_ranges_df["Max"]).div(2)
aed_ref_ranges_df

['mg/L' 'ug/mL' 'ng/mL']


Unnamed: 0,Drug,Min,Max,Unit,Avg
0,levetiracetam,12.0,46.0,mg/L,29.0
1,carbamazepine,4.0,10.0,mg/L,7.0
2,oxcarbazepine,3.0,35.0,ug/mL,19.0
3,clobazam,0.03,0.3,ng/mL,0.165
4,n-desmethylclobazam,0.3,3.0,ng/mL,1.65
5,topiramate,5.0,20.0,mg/L,12.5
6,valproic acid,50.0,125.0,ug/mL,87.5
7,lacosamide,1.0,10.0,ug/mL,5.5
8,felbamate,30.0,60.0,ug/mL,45.0
9,lamotrigine,2.5,15.0,mg/L,8.75


In [9]:
# Add a row to aed_ref_ranges_df with Drug = "clorazepate" and Avg = 20.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
# The guard keeps a re-run of this cell from adding the row a second time.
if not (aed_ref_ranges_df["Drug"] == "clorazepate").any():
    aed_ref_ranges_df = pd.concat(
        [aed_ref_ranges_df, pd.DataFrame([{"Drug": "clorazepate", "Avg": 20}])],
        ignore_index=True,
    )

  aed_ref_ranges_df = aed_ref_ranges_df.append(


In [10]:
# Frequency bands for which a synchrony_<band> column is added to each patient table.
# Only "broadband" is enabled; the full list is kept below for reference.
# frequency_bands = ["broadband", "60_100", "100_125"]
frequency_bands = ["broadband"]

In [11]:
# Build one feature table per patient on a 2-minute grid and save it as CSV.
# Columns written: emu_minute, med_<name>_raw (one per entry of all_med_names),
# med_sum_no_lorazepam_raw, med_sum_no_lorazepam_ddd, had_seizure,
# time_since_last_seizure, synchrony_<band>, ad_ratio and eeg_time.
for _, row in patients_df.iterrows():
    patient_hup_id = row.hup_id
    print(f"Processing HUP {patient_hup_id}")

    # Find the ieeg_offset_1 value for patient_hup_id in ieeg_offset_df and convert it into float
    ieeg_offset_seconds = float(
        ieeg_offset_df.loc[
            ieeg_offset_df["hup_id"] == patient_hup_id, "ieeg_offset_1"
        ].values[0]
    )
    print(f"ieeg_offset_seconds: {ieeg_offset_seconds}")
    ieeg_offset_minutes = ieeg_offset_seconds / 60

    ##############################################
    # MEDICATIONS
    ##############################################
    # Load HUP_{patient_hup_id}.npy from ../../Data/medications
    # NOTE: allow_pickle=True can execute arbitrary code stored in the file;
    # only load medication files from a trusted source.
    aed_np_file = np.load(
        f"../../Data/medications/HUP_{patient_hup_id}.npy", allow_pickle=True
    )

    all_dose_curves_plot = aed_np_file[0]
    all_tHr_plot = aed_np_file[1]
    all_med_names_plot = aed_np_file[2]

    # Construct the time axis: earliest start over all medications, with as many
    # samples as the longest medication curve.
    # NOTE(review): the end time comes from the first medication only, while the
    # start is the minimum over all of them -- confirm every curve ends together.
    emu_start_time_hrs = min(med_hours[0] for med_hours in all_tHr_plot)
    emu_end_time_hrs = all_tHr_plot[0][-1]
    max_length = max(len(med_hours) for med_hours in all_tHr_plot)
    time_axis = np.linspace(emu_start_time_hrs, emu_end_time_hrs, max_length)

    # Create the dataframe that will hold this patient's features
    hourly_patient_features_df = pd.DataFrame(columns=["emu_time"])
    hourly_patient_features_df["emu_time"] = time_axis

    # Every table gets a column for every medication in all_med_names;
    # medications absent from this patient's file stay at zero.
    for potential_med_name in all_med_names:
        hourly_patient_features_df[f"med_{potential_med_name}_raw"] = np.zeros(
            len(time_axis)
        )

    ##############################################
    # MEDICATIONS Normalize to 1
    ##############################################
    # Sum of the un-normalized dose curves (lorazepam excluded), scaled to a peak of 1
    raw_dose_curves = []
    for med_idx, med_name in enumerate(all_med_names_plot):
        if med_name == "lorazepam":
            continue

        dose_times = all_tHr_plot[med_idx].flatten()
        dose = all_dose_curves_plot[med_idx].flatten()

        # Resample onto the shared time axis; outside the curve's own range the dose is 0
        interp_func = interpolate.interp1d(
            dose_times, dose, bounds_error=False, fill_value=0
        )
        raw_dose_curves.append(interp_func(time_axis))

    # A patient with no medication other than lorazepam gets an all-zero curve
    if raw_dose_curves:
        cumulative_dose_curve = np.sum(raw_dose_curves, axis=0)
    else:
        cumulative_dose_curve = np.zeros(len(time_axis))

    # Skip the scaling when the curve is identically zero (avoids 0/0 -> NaN)
    peak_dose = np.max(cumulative_dose_curve)
    if peak_dose > 0:
        cumulative_dose_curve = cumulative_dose_curve / peak_dose

    assert len(cumulative_dose_curve) == len(
        time_axis
    ), "cumulative_dose_curve and time_axis should have the same length"

    hourly_patient_features_df["med_sum_no_lorazepam_raw"] = cumulative_dose_curve

    ##############################################
    # MEDICATIONS Normalize with DDD
    ##############################################
    # Each dose curve is divided by the drug's "Avg" value from aed_ref_ranges_df.
    # The list starts empty here so that the un-normalized curves collected
    # above are not added into the normalized sum.
    ddd_dose_curves = []
    for med_idx, med_name in enumerate(all_med_names_plot):
        dose_times = all_tHr_plot[med_idx].flatten()

        # Find Avg for medication med_name in aed_ref_ranges_df
        # (raises IndexError if the drug has no row there); lorazepam is left unscaled
        if med_name != "lorazepam":
            ref_range = float(
                aed_ref_ranges_df.loc[
                    aed_ref_ranges_df["Drug"] == med_name, "Avg"
                ].values[0]
            )
        else:
            ref_range = 1

        dose = all_dose_curves_plot[med_idx].flatten()
        dose = dose / ref_range

        interp_func = interpolate.interp1d(
            dose_times, dose, bounds_error=False, fill_value=0
        )
        dose_interp = interp_func(time_axis)

        if med_name != "lorazepam":
            ddd_dose_curves.append(dose_interp)

        # NOTE(review): the column is named "_raw" but receives the normalized
        # curve; the name is kept so the saved tables keep their column names.
        hourly_patient_features_df[f"med_{med_name}_raw"] = dose_interp

    if ddd_dose_curves:
        cumulative_dose_curve = np.sum(ddd_dose_curves, axis=0)
    else:
        cumulative_dose_curve = np.zeros(len(time_axis))

    assert len(cumulative_dose_curve) == len(
        time_axis
    ), "cumulative_dose_curve and time_axis should have the same length"

    hourly_patient_features_df["med_sum_no_lorazepam_ddd"] = cumulative_dose_curve

    ##############################################
    # Group by 2 minutes and compute mean
    ##############################################
    # emu_time (hours) -> whole minutes, rounded down to an even minute
    hourly_patient_features_df["emu_minute"] = (
        (hourly_patient_features_df["emu_time"] * 60).astype(int) // 2 * 2
    )
    hourly_patient_features_df = hourly_patient_features_df.groupby("emu_minute").mean()
    hourly_patient_features_df = hourly_patient_features_df.reset_index()
    hourly_patient_features_df = hourly_patient_features_df.drop(columns=["emu_time"])

    ##############################################
    # SEIZURE COUNT
    ##############################################
    seizure_times_sec = np.load(
        f"../../Data/seizures/source_mat/HUP_{patient_hup_id}.npy"
    )
    # Shift the seizure times by the iEEG offset
    seizure_times_sec = seizure_times_sec + ieeg_offset_seconds

    # Convert seizure times from seconds to minutes
    seizure_times_min = seizure_times_sec / 60

    hourly_patient_features_df["had_seizure"] = np.zeros(
        len(hourly_patient_features_df), dtype=int
    )

    # Count seizures per 2-minute bin using column 0 of the seizure array
    # (presumably the onset time -- TODO confirm)
    for sz_min in seizure_times_min[:, 0]:
        hourly_patient_features_df.loc[
            hourly_patient_features_df["emu_minute"] == int(sz_min) // 2 * 2,
            "had_seizure",
        ] += 1

    ##############################################
    # Time since last seizure
    ##############################################
    # None until the first seizure, then minutes elapsed since the latest one.
    # Each row advances the timer by 2, i.e. rows are taken to be 2 minutes apart.
    time_since_last_seizure = []
    timer = None

    for had_seizure in hourly_patient_features_df["had_seizure"]:
        # had_seizure is a count, so a bin with several seizures must also reset the timer
        if had_seizure > 0:
            timer = 0
        elif timer is not None:  # if there has been a seizure before
            timer += 2
        time_since_last_seizure.append(timer)

    # Add the list as a new column
    hourly_patient_features_df["time_since_last_seizure"] = time_since_last_seizure

    ##########################################
    # SYNCHRONY
    ##########################################

    # Determine the starting index for the synchrony data: the first row whose
    # [emu_minute, next emu_minute) interval contains the iEEG offset
    start_index = None
    emu_minutes = hourly_patient_features_df["emu_minute"]
    for row_pos, emu_min in enumerate(emu_minutes):
        if row_pos < len(emu_minutes) - 1:
            next_emu_min = emu_minutes.iloc[row_pos + 1]
        else:
            next_emu_min = emu_min + 2

        if emu_min <= ieeg_offset_minutes < next_emu_min:
            start_index = row_pos
            break

    # The offset falls outside the table's time range: fall back to the first row
    if start_index is None:
        print("start_index is actually 0...")
        start_index = 0

    for frequency_band in frequency_bands:
        # NOTE(review): the path is fixed to the broadband folder whatever
        # frequency_band is; adding other bands needs a band-specific path.
        synchrony_np = np.load(
            f"../../Data/synchrony/all/broadband_multi_dataset_combined/HUP_{patient_hup_id}.npy"
        )

        # Initialize the synchrony column with NaNs
        hourly_patient_features_df[f"synchrony_{frequency_band}"] = np.nan

        # Insert synchrony values starting from the appropriate index,
        # truncating whatever would run past the end of the table
        end_index = min(
            start_index + len(synchrony_np), len(hourly_patient_features_df)
        )
        hourly_patient_features_df.iloc[
            start_index:end_index,
            hourly_patient_features_df.columns.get_loc(f"synchrony_{frequency_band}"),
        ] = synchrony_np[: end_index - start_index]

    ##########################################
    # AD Ratio
    ##########################################
    mat_file = scipy.io.loadmat(
        f"../../../erinconr/projects/fc_toolbox/results/analysis/intermediate/HUP{patient_hup_id}.mat"
    )
    mat_file = mat_file["summ"][0][0]
    # Fields are addressed by position: 17 is used as the AD ratio and the first
    # dimension of field 6 as the channel count
    ad_ratio = mat_file[17]
    num_channels = mat_file[6].shape[0]
    assert ad_ratio.shape[0] == num_channels

    # Average across channels, then robust-scale: (x - median) / IQR
    ad_ratio = np.nanmean(ad_ratio, axis=0)
    ad_ratio = (ad_ratio - np.nanmedian(ad_ratio)) / iqr(ad_ratio, nan_policy="omit")
    assert np.nansum(ad_ratio) != 0

    # Reshape ad_ratio to match the granularity of the dataframe: every value is
    # repeated for 5 consecutive rows (presumably 10-minute windows -- TODO confirm)
    reshaped_ad_ratio = np.repeat(ad_ratio, 5)

    # Initialize the ad_ratio column with NaNs
    hourly_patient_features_df["ad_ratio"] = np.nan

    # Insert reshaped_ad_ratio values starting from the appropriate index
    end_index_ad_ratio = min(
        start_index + len(reshaped_ad_ratio), len(hourly_patient_features_df)
    )
    hourly_patient_features_df.iloc[
        start_index:end_index_ad_ratio,
        hourly_patient_features_df.columns.get_loc("ad_ratio"),
    ] = reshaped_ad_ratio[: end_index_ad_ratio - start_index]

    ##########################################
    # EEG time
    ##########################################
    # Create the eeg_time column with NaN values
    hourly_patient_features_df["eeg_time"] = np.nan

    # Minutes since start_index (0, 2, 4, ...) over the rows covered by ad_ratio
    eeg_time_values = np.arange(
        0, (end_index_ad_ratio - start_index) * 2, 2
    )  # incrementing by 2 since the time is grouped by 2 minutes
    hourly_patient_features_df.iloc[
        start_index:end_index_ad_ratio,
        hourly_patient_features_df.columns.get_loc("eeg_time"),
    ] = eeg_time_values

    ##############################################
    # SAVE TO CSV
    ##############################################

    hourly_patient_features_df.to_csv(
        f"../../Data/giant_tables_multi_dataset/HUP_{patient_hup_id}.csv", index=False
    )

Processing HUP 216
ieeg_offset_seconds: 111832.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 215
ieeg_offset_seconds: 140348.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 214
ieeg_offset_seconds: 39906.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 213
ieeg_offset_seconds: 127767.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 209
ieeg_offset_seconds: 66189.0
start_index is actually 0...


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 208
ieeg_offset_seconds: 134301.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 201
ieeg_offset_seconds: 125353.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 197
ieeg_offset_seconds: 127860.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 195
ieeg_offset_seconds: 133216.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 194
ieeg_offset_seconds: 125874.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 193
ieeg_offset_seconds: 141614.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 181
ieeg_offset_seconds: 126628.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 179
ieeg_offset_seconds: 132732.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 168
ieeg_offset_seconds: 280831.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 167
ieeg_offset_seconds: 146564.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 159
ieeg_offset_seconds: 122834.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 156
ieeg_offset_seconds: 129269.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 153
ieeg_offset_seconds: 130316.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 152
ieeg_offset_seconds: 134267.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 149
ieeg_offset_seconds: 128245.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 148
ieeg_offset_seconds: 133729.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 147
ieeg_offset_seconds: 139686.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 140
ieeg_offset_seconds: 126678.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)


Processing HUP 137
ieeg_offset_seconds: 48893.0


  ad_ratio = np.nanmean(ad_ratio, axis=0)
