In [1]:
import os
import numpy as np
import pandas as pd
from ieeg.auth import Session
from datetime import datetime, date

from get_iEEG_data import *
from iEEG_helper_functions import *

In [2]:
# Input directory — presumably holds the per-dataset broadband synchrony arrays
# (one file per patient/dataset); TODO confirm against the cell that loads them.
SOURCE_DIRECTORY = "../../Data/synchrony/all/broadband_multi_dataset"
# Output directory — presumably receives the per-patient combined arrays;
# nothing in the visible cells writes to it yet, so verify before relying on it.
TARGET_DIRECTORY = "../../Data/synchrony/all/broadband_multi_dataset_combined"

In [3]:
# Read the multi-dataset batch manifest and keep only the patient identifier
# (hup_id) and the number of datasets recorded for that patient (num_datasets).
patients_df = pd.read_csv("../../Data/multi_dataset_batches.csv").loc[
    :, ["hup_id", "num_datasets"]
]
patients_df

Unnamed: 0,hup_id,num_datasets
0,193,3
1,181,2
2,137,3
3,195,3
4,148,2
5,156,3
6,147,2
7,179,2
8,194,3
9,167,2


In [4]:
# Load the per-patient dataset start times and normalise them for lookup by hup_id.
file_start_times_df = pd.read_csv("../../Data/file_start_times.csv")
# The "HUP ID" column is formatted as HUPXXX; keep only the integer part as hup_id
file_start_times_df["hup_id"] = file_start_times_df["HUP ID"].str[3:].astype(int)
file_start_times_df = file_start_times_df.drop(columns=["HUP ID"])
# Only keep the patients that are listed in patients_df, with a fresh 0..n-1 index
file_start_times_df = file_start_times_df[
    file_start_times_df["hup_id"].isin(patients_df["hup_id"])
].reset_index(drop=True)
# Drop columns that are NaN for every remaining patient
file_start_times_df = file_start_times_df.dropna(axis=1, how="all")
# Convert the dataset columns ("1", "2", ...) to datetimes. Only the columns that
# survived the dropna above are converted: hard-coding "1".."4" raises KeyError
# as soon as e.g. no selected patient has a fourth dataset.
# NOTE(review): the CSV values appear to be time-of-day strings, in which case
# pd.to_datetime attaches the date on which this cell is run, so only the time
# part is meaningful — confirm against the CSV.
dataset_columns = [
    column for column in file_start_times_df.columns if str(column).isdigit()
]
for column in dataset_columns:
    file_start_times_df[column] = pd.to_datetime(file_start_times_df[column])
file_start_times_df

Unnamed: 0,1,2,3,4,hup_id
0,2023-12-05 13:34:53,2023-12-05 07:00:30,2023-12-05 15:47:02,NaT,137
1,2023-12-05 11:11:18,2023-12-05 06:45:35,NaT,NaT,140
2,2023-12-05 14:48:06,2023-12-05 13:15:43,NaT,NaT,147
3,2023-12-05 13:08:49,2023-12-05 13:22:40,NaT,NaT,148
4,2023-12-05 11:37:25,2023-12-05 09:29:45,2023-12-05 06:44:49,2023-12-05 16:20:33,149
5,2023-12-05 13:17:47,2023-12-05 15:44:30,NaT,NaT,152
6,2023-12-05 12:11:56,2023-12-05 10:56:10,NaT,NaT,153
7,2023-12-05 11:54:29,2023-12-05 11:24:13,2023-12-05 10:18:45,NaT,156
8,2023-12-05 10:07:14,2023-12-05 07:00:27,NaT,NaT,159
9,2023-12-05 16:42:44,2023-12-05 07:06:50,NaT,NaT,167


In [5]:
# Log in to iEEG: the contents of the local credentials file are passed as the
# second argument of Session (presumably the account password — confirm against
# the ieeg.auth.Session signature).
print("Using Carlos session")
with open("agu_ieeglogin.bin", mode="r") as credentials_file:
    session = Session("aguilac", credentials_file.read())

Using Carlos session


In [6]:
# Build tuples_df: one row per patient in patients_df (same index), holding the
# hup_id plus empty start/end placeholders for up to four datasets.
tuple_columns = ["hup_id"] + [
    f"ds_{ds_number}_{edge}"
    for ds_number in range(1, 5)
    for edge in ("start", "end")
]
# Create the whole frame in one step instead of concatenating one row per
# iteration (quadratic, and dependent on pandas' deprecated handling of empty
# frames in concat). dtype=object keeps the placeholder columns able to store
# the pandas Timestamp objects that later cells assign and read back unchanged.
tuples_df = pd.DataFrame(index=patients_df.index, columns=tuple_columns, dtype=object)
tuples_df["hup_id"] = patients_df["hup_id"]
tuples_df

Unnamed: 0,hup_id,ds_1_start,ds_1_end,ds_2_start,ds_2_end,ds_3_start,ds_3_end,ds_4_start,ds_4_end
0,193,,,,,,,,
1,181,,,,,,,,
2,137,,,,,,,,
3,195,,,,,,,,
4,148,,,,,,,,
5,156,,,,,,,,
6,147,,,,,,,,
7,179,,,,,,,,
8,194,,,,,,,,
9,167,,,,,,,,


In [8]:
# For every dataset of every patient, query iEEG for the recording duration and
# store the dataset's start timestamp and start + duration in tuples_df.
# NOTE(review): each iteration opens a remote dataset, so this cell is slow;
# consider caching tuples_df to disk once it has been filled.
for index, row in patients_df.iterrows():
    # Get the HUP ID and the number of datasets for this patient
    hup_id = row["hup_id"]
    num_datasets = row["num_datasets"]

    # Both lookups depend only on the patient, so do them once per patient
    # rather than once per dataset.
    file_start_times_row = file_start_times_df.loc[
        file_start_times_df["hup_id"] == hup_id
    ]
    if file_start_times_row.empty:
        # Previously surfaced as a bare IndexError from `.values[0]`, and only
        # after a remote dataset had already been opened.
        raise IndexError(f"No start times found for HUP{hup_id} in file_start_times_df")
    patient_mask = tuples_df["hup_id"] == hup_id

    for ds_index in range(1, num_datasets + 1):
        dataset_name = f"HUP{hup_id}_phaseII_D0{ds_index}"
        dataset = session.open_dataset(dataset_name)
        # Duration of the first channel; treated as microseconds and converted
        # to seconds below.
        duration_usec = dataset.get_time_series_details(
            dataset.get_channel_labels()[0]
        ).duration
        duration_sec = duration_usec / 1e6
        print(f"Duration of {dataset_name} is {duration_sec} seconds")

        # Start time of this dataset (column named after the dataset number)
        ds_start = pd.to_datetime(file_start_times_row[str(ds_index)].values[0])
        ds_end = ds_start + pd.Timedelta(seconds=duration_sec)

        # Record start and end on this patient's row of tuples_df
        tuples_df.loc[patient_mask, f"ds_{ds_index}_start"] = ds_start
        tuples_df.loc[patient_mask, f"ds_{ds_index}_end"] = ds_end

Duration of HUP193_phaseII_D01 is 327798.203593 seconds
Duration of HUP193_phaseII_D02 is 272961.533718 seconds
Duration of HUP193_phaseII_D03 is 417640.008031 seconds
Duration of HUP181_phaseII_D01 is 329678.060546 seconds
Duration of HUP181_phaseII_D02 is 259166.205078 seconds


KeyboardInterrupt: 

In [10]:
# Debug inspection: show the value of ds_start currently held in the kernel
# (left over from whichever cell assigned it most recently).
ds_start

Timestamp('2023-12-04 16:20:33')

In [11]:
# Debug inspection: show the value of ds_end currently held in the kernel
# (left over from whichever cell assigned it most recently).
ds_end

Timestamp('2023-12-14 09:13:05.468750')

In [8]:
# Display tuples_df with the per-dataset start/end columns filled in.
# NOTE(review): the execution counter of this cell is out of sequence, so the
# shown table may come from an earlier kernel run — re-run top to bottom to confirm.
tuples_df

Unnamed: 0,hup_id,ds_1_start,ds_1_end,ds_2_start,ds_2_end,ds_3_start,ds_3_end,ds_4_start,ds_4_end
0,193,2023-12-04 15:20:14,2023-12-08 10:23:32.203593,2023-12-04 11:59:16,2023-12-07 15:48:37.533718,2023-12-04 10:44:24,2023-12-09 06:45:04.008031,,
1,181,2023-12-04 11:10:28,2023-12-08 06:45:06.060546,2023-12-04 06:45:36,2023-12-07 06:45:02.205078,,,,
2,137,2023-12-04 13:34:53,2023-12-08 07:00:00.933593,2023-12-04 07:00:30,2023-12-06 11:09:54.044921,2023-12-04 15:47:02,2023-12-06 14:58:52.664062,,
3,195,2023-12-04 13:00:16,2023-12-08 11:51:06.910250,2023-12-04 16:50:26,2023-12-05 10:44:22.825718,2023-12-04 10:53:12,2023-12-10 15:36:07.830218,,
4,148,2023-12-04 13:08:49,2023-12-05 13:20:39.703125,2023-12-04 13:22:40,2023-12-11 11:04:34.220703,,,,
5,156,2023-12-04 11:54:29,2023-12-07 11:19:46.066406,2023-12-04 11:24:13,2023-12-05 06:45:06.347656,2023-12-04 10:18:45,2023-12-07 09:30:15.478515,,
6,147,2023-12-04 14:48:06,2023-12-07 13:11:09.016601,2023-12-04 13:15:43,2023-12-08 23:54:50.429687,,,,
7,179,2023-12-04 12:52:12,2023-12-12 07:45:01.728515,2023-12-04 07:45:24,2023-12-07 07:45:02.099609,,,,
8,194,2023-12-04 10:57:54,2023-12-05 01:00:42.357421,2023-12-04 07:25:12,2023-12-09 21:15:50.583484,2023-12-04 21:22:46,2023-12-09 14:03:49.465031,,
9,167,2023-12-04 16:42:44,2023-12-09 07:00:03.210937,2023-12-04 07:06:50,2023-12-10 14:23:28.648437,,,,


In [37]:
# Iterate through each patient using iterrows
for index, row in patients_df.iterrows():
    # Get the HUP ID
    hup_id = row["hup_id"]
    # Get the number of datasets
    num_datasets = row["num_datasets"]

    for ds_index in range(2, num_datasets + 1):
        # Get the row of tuples_df with hup_id == hup_id
        tuples_row = tuples_df.loc[tuples_df["hup_id"] == hup_id]
        # Make all non-nan values datetime objects with only time

        prev_ds_end = tuples_row[f"ds_{ds_index-1}_end"].values[0]
        curr_ds_start = tuples_row[f"ds_{ds_index}_start"].values[0]
        # prev_ds_end has format Timestamp('2023-12-07 14:44:27.651367'), convert to Timestamp('2023-12-04 16:20:33')
        prev_ds_end = prev_ds_end.floor("s")
        curr_ds_start = curr_ds_start.floor("s")
        prev_ds_end_time = prev_ds_end.time()
        curr_ds_start_time = curr_ds_start.time()
        # subtract curr_ds_start_time from prev_ds_end_time, both are datetime.time objects
        time_diff = datetime.combine(
            date.today(), curr_ds_start_time
        ) - datetime.combine(date.today(), prev_ds_end_time)