In [1]:
import os
import numpy as np
import pandas as pd
from ieeg.auth import Session
from scipy.signal import resample_poly

from get_iEEG_data import *
from iEEG_helper_functions import *

In [2]:
# Directory (relative to this notebook) that holds the per-dataset synchrony vectors
SYNCHRONY_BROADBAND_MULTI_DS_DIRECTORY = "../../Data/synchrony/all/broadband_multi_dataset"

In [3]:
# Read the multi-dataset batch table and start every row with the
# has_multi_sampling_rate flag switched off
df = pd.read_csv("../../Data/multi_dataset_batches.csv").assign(
    has_multi_sampling_rate=False
)
df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,batch,has_multi_sampling_rate
0,193,3,282.888818,"[256, 256, 256]",256,72419.537447,1,False
1,181,2,163.567852,"[512, 512]",512,83746.74,1,False
2,137,3,188.772956,"[512, 512, 256]",256,84569.2175,1,False
3,195,3,261.462102,"[256, 256, 512]",256,105005.468189,1,False
4,148,2,189.895812,"[1024, 512]",512,109615.733611,1,False
5,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,1,False
6,147,2,177.036235,"[1024, 512]",512,126679.248055,1,False
7,179,2,258.874397,"[512, 512]",512,132543.691111,1,False
8,194,3,260.575113,"[512, 512, 512]",512,133414.457733,1,False
9,167,2,261.566072,"[512, 512]",512,133921.828889,1,False


In [4]:
# Parse a stringified list of integers such as "[512, 512, 256]"
def string_to_list(string):
    """
    Convert a stringified list of integers to a list of integers

    Parameters
    ----------
    string: str
        A stringified list of integers, e.g. "[512, 512, 256]".
        Whitespace around the brackets or the items is ignored and an
        empty list ("[]") is accepted.

    Returns
    -------
    list
        A list of integers (empty when the string holds no items)

    Raises
    ------
    ValueError
        If an item between the commas is not a valid integer
    """
    # Remove outer whitespace first so the brackets are really at the ends
    inner = string.strip().strip("[]")
    # Skip blank tokens: "".split(",") yields [""], and int("") would raise
    return [int(token) for token in inner.split(",") if token.strip()]

In [5]:
# Flag every row whose datasets were not all recorded at one sampling rate:
# parse the stringified rate list and count its distinct values
for row_label, rates_text in df["sampling_rate"].items():
    distinct_rates = set(string_to_list(rates_text))
    if len(distinct_rates) >= 2:
        df.at[row_label, "has_multi_sampling_rate"] = True

In [6]:
# Keep only the rows flagged has_multi_sampling_rate == True, renumber them
# from zero and drop the batch column
multiple_sample_rate_df = (
    df.loc[df["has_multi_sampling_rate"] == True]
    .reset_index(drop=True)
    .drop(columns=["batch"])
)
multiple_sample_rate_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,has_multi_sampling_rate
0,137,3,188.772956,"[512, 512, 256]",256,84569.2175,True
1,195,3,261.462102,"[256, 256, 512]",256,105005.468189,True
2,148,2,189.895812,"[1024, 512]",512,109615.733611,True
3,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,True
4,147,2,177.036235,"[1024, 512]",512,126679.248055,True
5,214,2,311.207215,"[1024, 512]",512,206271.032951,True
6,153,2,305.785224,"[512, 1024]",512,227762.6725,True
7,213,2,558.426826,"[1024, 512]",512,369247.993412,True
8,215,4,235.29497,"[2048, 1024, 2048, 1024]",1024,412838.037587,True
9,149,4,523.779698,"[512, 1024, 1024, 1024]",512,525169.350833,True


In [7]:
def assign_batches(df, column, num_batches=4):
    """
    Greedily split the rows of a dataframe into batches of similar total size

    Rows are sorted by ``column`` in ascending order and handed out one by
    one. A new batch is started when adding the next row would push the
    running total of the current batch above ``total / num_batches``. The
    last batch absorbs every remaining row.

    Parameters
    ----------
    df: pandas.DataFrame
        The rows to distribute. It is not modified.
    column: str
        Name of the numeric column that holds the size of each row
    num_batches: int, optional
        Maximum number of batches to create (default 4)

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``column`` with an integer ``batch``
        column numbered from 1

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    # sort_values returns a new dataframe, so the caller's frame stays untouched
    sorted_df = df.sort_values(by=column)

    # Size every batch should ideally reach
    target_per_batch = sorted_df[column].sum() / num_batches

    batch_numbers = []
    current_batch = 1
    current_sum = 0
    rows_in_current_batch = 0

    for size in sorted_df[column]:
        would_overflow = current_sum + size > target_per_batch
        # Only move on when the current batch already holds a row; otherwise a
        # single row larger than the target would leave that batch empty
        if would_overflow and rows_in_current_batch > 0 and current_batch < num_batches:
            current_batch += 1
            current_sum = 0
            rows_in_current_batch = 0
        batch_numbers.append(current_batch)
        current_sum += size
        rows_in_current_batch += 1

    # Positional assignment: independent of the index labels, which may repeat
    sorted_df["batch"] = np.asarray(batch_numbers, dtype="int64")

    return sorted_df


# Spread the multi-sampling-rate rows over batches according to size_estimate
multiple_sample_rate_df = assign_batches(
    df=multiple_sample_rate_df, column="size_estimate"
)

In [8]:
# Show the table with the batch number assigned to each row
multiple_sample_rate_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,has_multi_sampling_rate,batch
0,137,3,188.772956,"[512, 512, 256]",256,84569.2175,True,1
1,195,3,261.462102,"[256, 256, 512]",256,105005.468189,True,1
2,148,2,189.895812,"[1024, 512]",512,109615.733611,True,1
3,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,True,1
4,147,2,177.036235,"[1024, 512]",512,126679.248055,True,1
5,214,2,311.207215,"[1024, 512]",512,206271.032951,True,2
6,153,2,305.785224,"[512, 1024]",512,227762.6725,True,2
7,213,2,558.426826,"[1024, 512]",512,369247.993412,True,3
8,215,4,235.29497,"[2048, 1024, 2048, 1024]",1024,412838.037587,True,4
9,149,4,523.779698,"[512, 1024, 1024, 1024]",512,525169.350833,True,4


In [None]:
print("Using Carlos session")
# Read the stored password, then open the session once the file is closed
with open("agu_ieeglogin.bin", "r") as password_file:
    password = password_file.read()
session = Session("aguilac", password)

In [None]:
# Select the rows assigned to batch 1 and renumber them from zero
batch = multiple_sample_rate_df.loc[
    lambda frame: frame["batch"] == 1
].reset_index(drop=True)
batch

In [None]:
# For every dataset of every row in `batch`: fetch the recording in 2-minute
# windows, clean it, bring it to the row's minimum sampling rate and store one
# broadband synchrony value per window in HUP_<id>_ds_<n>.npy.

# Create the output directory up front so a long run is not lost to a failing
# np.save at the very end
os.makedirs(SYNCHRONY_BROADBAND_MULTI_DS_DIRECTORY, exist_ok=True)

# Length of one analysis window: 2 minutes, in microseconds
interval_duration_usec = 1.2e8

for index, row in batch.iterrows():
    hup_id = row["hup_id"]
    num_datasets = row["num_datasets"]
    print(f"HUP {hup_id} has {num_datasets} datasets")
    # Every dataset recorded faster than this rate is resampled down to it
    min_sampling_rate = int(row["min_sampling_rate"])
    for ds_index in range(1, num_datasets + 1):
        output_file_name = f"HUP_{hup_id}_ds_{ds_index}.npy"
        output_path = os.path.join(
            SYNCHRONY_BROADBAND_MULTI_DS_DIRECTORY, output_file_name
        )
        # Datasets whose result file already exists are not processed again
        if os.path.exists(output_path):
            print(f"{output_file_name} exists, skip...")
            continue
        dataset_name = f"HUP{hup_id}_phaseII_D0{ds_index}"
        dataset = session.open_dataset(dataset_name)

        all_channel_labels = np.array(dataset.get_channel_labels())
        channel_labels_to_download = all_channel_labels[
            electrode_selection(all_channel_labels)
        ]

        # Duration is read from the first selected channel
        dataset_duration_usec = dataset.get_time_series_details(
            channel_labels_to_download[0]
        ).duration
        duration_hours = int(dataset_duration_usec / 1000000 / 60 / 60)
        # Pad by 24 hours; the loop below stops at the first failed request
        enlarged_duration_hours = duration_hours + 24

        print(f"Opening {dataset_name} with duration {duration_hours} hours")

        # Calculate the total number of 2-minute intervals in the enlarged duration
        total_intervals = enlarged_duration_hours * 30  # 60min/hour / 2min = 30

        # Windows that are skipped or never reached stay NaN
        synchrony_broadband_vector_to_save = np.full(total_intervals, np.nan)

        # Loop through each 2-minute interval
        for interval in range(total_intervals):
            print(f"Getting iEEG data for interval {interval} out of {total_intervals}")
            start_time_usec = interval * interval_duration_usec
            stop_time_usec = start_time_usec + interval_duration_usec

            try:
                ieeg_data, fs = get_iEEG_data(
                    "aguilac",
                    "agu_ieeglogin.bin",
                    dataset_name,
                    start_time_usec,
                    stop_time_usec,
                    channel_labels_to_download,
                )
                fs = int(fs)
            except Exception as e:
                # NOTE(review): any failure ends this dataset and the partially
                # filled vector is still saved below; presumably this is how the
                # end of the recording is detected -- confirm that transient
                # errors should not be retried instead
                print(f"Error: {e}")
                break

            # Drop rows that has any nan
            ieeg_data = ieeg_data.dropna(axis=0, how="any")
            if ieeg_data.empty:
                print("Empty dataframe after dropping nan, skip...")
                continue

            good_channels_res = detect_bad_channels_optimized(ieeg_data.to_numpy(), fs)
            good_channel_indicies = good_channels_res[0]
            good_channel_labels = channel_labels_to_download[good_channel_indicies]
            # Columns are channels, so the array is (samples, channels)
            ieeg_data = ieeg_data[good_channel_labels].to_numpy()

            # Check if ieeg_data is empty after dropping bad channels; done before
            # resampling so an empty array is never handed to resample_poly
            if ieeg_data.size == 0:
                print("Empty dataframe after dropping bad channels, skip...")
                continue

            if fs > min_sampling_rate:
                # Resample along axis 0 (time). axis=1 would resample across
                # channels and leave the signal at its original rate
                ieeg_data = resample_poly(ieeg_data, min_sampling_rate, fs, axis=0)
                fs = min_sampling_rate

            ieeg_data = common_average_montage(ieeg_data)

            # Notch out 59-61 Hz at the rate the data is actually sampled at
            ieeg_data = notch_filter(ieeg_data, 59, 61, fs)

            ##############################
            # Calculate synchrony (broadband)
            ##############################
            _, R = calculate_synchrony(ieeg_data.T)
            synchrony_broadband_vector_to_save[interval] = R

            print(f"Finished calculating synchrony for interval {interval}")

        ##############################
        # Save the synchrony output
        ##############################
        np.save(output_path, synchrony_broadband_vector_to_save)
        print(f"Saved {output_file_name}")

In [1]:
!jupyter nbconvert --to python multi_ds_multi_hz_download.ipynb

[NbConvertApp] Converting notebook multi_ds_multi_hz_download.ipynb to python
[NbConvertApp] Writing 7157 bytes to multi_ds_multi_hz_download.py
