In [1]:
import os
import numpy as np
import pandas as pd
from ieeg.auth import Session

from get_iEEG_data import *
from iEEG_helper_functions import *

In [2]:
# Build the table of patients whose recording is split over several iEEG portal
# datasets (portal name looks like "HUP137_phaseII_D01-D03").
implant_dates_df = pd.read_excel("../../Data/HUP_implant_dates.xlsx")

# Make the hup_id column integers
implant_dates_df["hup_id"] = implant_dates_df["hup_id"].astype(int)

# Drop the columns that are not needed for the size estimate. Reassigning (rather
# than inplace=True) keeps the frame read from disk untouched.
nina_patients_df = implant_dates_df.drop(
    columns=["Implant_Date", "implant_time", "Explant_Date", "weight_kg"]
)

# A portal name ending in "phaseII" has no "_D01-D0N" suffix, i.e. it is a single dataset.
# na=True flags rows with a missing portal name as well, so they are filtered out below
# exactly as the previous `== False` comparison did.
nina_patients_df["is_single_dataset"] = nina_patients_df[
    "IEEG_Portal_Number"
].str.endswith("phaseII", na=True)

# Only keep the multi-dataset patients. The explicit .copy() makes the result an
# independent frame, so adding columns below does not raise SettingWithCopyWarning.
nina_patients_df = nina_patients_df[~nina_patients_df["is_single_dataset"]].copy()

# num_datasets is the trailing number of the portal name ("...D01-D03" -> 3).
# Taking all trailing digits (instead of only the last character) also handles
# patients with ten or more datasets.
nina_patients_df["num_datasets"] = (
    nina_patients_df["IEEG_Portal_Number"]
    .str.extract(r"(\d+)$", expand=False)
    .astype(int)
)

# Sort by hup_id in ascending order and renumber the rows
nina_patients_df = nina_patients_df.sort_values(by=["hup_id"]).reset_index(drop=True)

# Placeholder columns that are filled in once the datasets have been queried
nina_patients_df["total_hours"] = 0
nina_patients_df["sampling_rate"] = np.nan
nina_patients_df["min_sampling_rate"] = 0
nina_patients_df["size_estimate"] = 0
nina_patients_df

Unnamed: 0,hup_id,IEEG_Portal_Number,is_single_dataset,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate
0,137,HUP137_phaseII_D01-D03,False,3,0,,0,0
1,140,HUP140_phaseII_D01-D02,False,2,0,,0,0
2,147,HUP147_phaseII_D01-D02,False,2,0,,0,0
3,148,HUP148_phaseII_D01-D02,False,2,0,,0,0
4,149,HUP149_phaseII_D01-D04,False,4,0,,0,0
5,152,HUP152_phaseII_D01-D02,False,2,0,,0,0
6,153,HUP153_phaseII_D01-D02,False,2,0,,0,0
7,156,HUP156_phaseII_D01-D03,False,3,0,,0,0
8,159,HUP159_phaseII_D01-D02,False,2,0,,0,0
9,167,HUP167_phaseII_D01-D02,False,2,0,,0,0


In [3]:
# Log in to the iEEG portal with the password stored in the local login file.
print("Using Carlos session")
with open("agu_ieeglogin.bin", "r") as password_file:
    password = password_file.read()
session = Session("aguilac", password)

Using Carlos session


## Check if we can open every dataset

In [None]:
# Try to open every dataset of every patient. An exception from open_dataset
# aborts the loop, so the success message is only printed if all of them opened.
# Iterating over the two columns directly (instead of iterrows) keeps hup_id and
# num_datasets as integers regardless of the other columns' dtypes.
for hup_id, num_datasets in zip(
    nina_patients_df["hup_id"], nina_patients_df["num_datasets"]
):
    # Datasets are numbered from 1 with a two-digit, zero-padded suffix (D01, D02, ...)
    for i in range(1, num_datasets + 1):
        dataset_name = f"HUP{hup_id}_phaseII_D{i:02d}"
        session.open_dataset(dataset_name)
print("Successfully opened all datasets")

## Get information about the size of these datasets

In [4]:
# For every patient, query the duration and sampling rate of each dataset and
# summarise them into one value per patient. The per-patient results are collected
# in lists and written as whole columns afterwards, so the placeholder columns are
# replaced with correctly typed data instead of being upcast cell by cell.
total_hours_per_patient = []
sampling_rates_per_patient = []
min_sampling_rate_per_patient = []
size_estimate_per_patient = []

for hup_id, num_datasets in zip(
    nina_patients_df["hup_id"], nina_patients_df["num_datasets"]
):
    durations_hours = []
    sampling_rates = []
    # Datasets are numbered from 1 with a two-digit, zero-padded suffix (D01, D02, ...)
    for i in range(1, num_datasets + 1):
        dataset_name = f"HUP{hup_id}_phaseII_D{i:02d}"
        dataset = session.open_dataset(dataset_name)
        labels = dataset.get_channel_labels()
        # The first channel is used as the reference for the whole dataset;
        # its details are fetched once and reused for both values.
        details = dataset.get_time_series_details(labels[0])
        # NOTE(review): duration is presumably in microseconds (hence / 1e6 / 3600
        # to get hours) - confirm against the ieeg API.
        durations_hours.append(details.duration / 1e6 / 3600)
        sampling_rates.append(int(details.sample_rate))

    # Total recording time across all datasets of this patient
    total_hours_per_patient.append(sum(durations_hours))
    # The full list of per-dataset sampling rates, stored as its string representation
    sampling_rates_per_patient.append(str(sampling_rates))
    min_sampling_rate_per_patient.append(min(sampling_rates))
    # Size estimate: dot product of durations (hours) and sampling rates
    size_estimate_per_patient.append(float(np.dot(durations_hours, sampling_rates)))

nina_patients_df["total_hours"] = total_hours_per_patient
nina_patients_df["sampling_rate"] = sampling_rates_per_patient
nina_patients_df["min_sampling_rate"] = min_sampling_rate_per_patient
nina_patients_df["size_estimate"] = size_estimate_per_patient

In [7]:
# Drop the helper columns IEEG_Portal_Number and is_single_dataset.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and the drop is simply a no-op instead of raising KeyError.
nina_patients_df = nina_patients_df.drop(
    columns=["IEEG_Portal_Number", "is_single_dataset"], errors="ignore"
)
nina_patients_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate
0,137,3,188.772956,"[512, 512, 256]",256,84569.2175
1,140,2,148.518412,"[1024, 1024]",1024,152082.854166
2,147,2,177.036235,"[1024, 512]",512,126679.248055
3,148,2,189.895812,"[1024, 512]",512,109615.733611
4,149,4,523.779698,"[512, 1024, 1024, 1024]",512,525169.350833
5,152,2,385.824318,"[512, 512]",512,197542.050833
6,153,2,305.785224,"[512, 1024]",512,227762.6725
7,156,3,161.961359,"[1024, 512, 512]",512,119491.976389
8,159,2,291.381685,"[512, 512]",512,149187.422778
9,167,2,261.566072,"[512, 512]",512,133921.828889


## Assign each patient to one of 4 batches so that the batches have roughly equal total size estimates

In [8]:
def assign_batches(df, column, num_batches=4):
    """Split the rows of ``df`` into batches with roughly equal totals of ``column``.

    The rows are sorted by ``column`` in ascending order and walked once. A new
    batch is started whenever adding the next row would push the running total of
    the current batch above ``total / num_batches``. Once the last batch has been
    reached, all remaining rows are placed in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a sorted copy is returned.
    column : str
        Name of the numeric column whose per-batch sum should be balanced.
    num_batches : int, default 4
        Maximum number of batches to create.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by ``column`` with an additional ``"batch"`` column
        holding batch numbers starting at 1.

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    # Sort by the specified column (sort_values returns a new frame)
    sorted_df = df.sort_values(by=column)

    # Target total for each batch
    target_per_batch = sorted_df[column].sum() / num_batches

    batch_labels = []
    current_batch = 1
    current_sum = 0
    rows_in_batch = 0

    for size in sorted_df[column]:
        # Only move on when the current batch already holds a row; otherwise a
        # first row larger than the target would leave batch 1 empty.
        if (
            rows_in_batch > 0
            and current_sum + size > target_per_batch
            and current_batch < num_batches
        ):
            current_batch += 1
            current_sum = 0
            rows_in_batch = 0
        batch_labels.append(current_batch)
        current_sum += size
        rows_in_batch += 1

    # Positional assignment, so duplicate index labels cannot cross-assign rows
    sorted_df["batch"] = batch_labels
    return sorted_df


# Tag every patient with a batch number, balancing the batches on size_estimate
nina_patients_df = assign_batches(df=nina_patients_df, column="size_estimate")

In [14]:
# Show the patients table as returned by assign_batches (sorted by size_estimate, with the batch column)
nina_patients_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,batch
13,193,3,282.888818,"[256, 256, 256]",256,72419.537447,1
12,181,2,163.567852,"[512, 512]",512,83746.74,1
0,137,3,188.772956,"[512, 512, 256]",256,84569.2175,1
15,195,3,261.462102,"[256, 256, 512]",256,105005.468189,1
3,148,2,189.895812,"[1024, 512]",512,109615.733611,1
7,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,1
2,147,2,177.036235,"[1024, 512]",512,126679.248055,1
11,179,2,258.874397,"[512, 512]",512,132543.691111,1
14,194,3,260.575113,"[512, 512, 512]",512,133414.457733,1
9,167,2,261.566072,"[512, 512]",512,133921.828889,1
